From 5feadf761fca4b1f6db29edd0769115474c94291 Mon Sep 17 00:00:00 2001 From: orangekame3 Date: Sat, 30 Sep 2023 09:57:24 +0900 Subject: [PATCH 001/670] remove ioutil pkg --- tensorflow/go/example_inception_inference_test.go | 7 +++---- tensorflow/go/genop/internal/genop.go | 6 +++--- tensorflow/go/genop/main.go | 5 ++--- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/tensorflow/go/example_inception_inference_test.go b/tensorflow/go/example_inception_inference_test.go index 475619c55a5472..13a9316298a6d9 100644 --- a/tensorflow/go/example_inception_inference_test.go +++ b/tensorflow/go/example_inception_inference_test.go @@ -22,14 +22,13 @@ import ( "flag" "fmt" "io" - "io/ioutil" "log" "net/http" "os" "path/filepath" - "github.com/tensorflow/tensorflow/tensorflow/go/op" tf "github.com/tensorflow/tensorflow/tensorflow/go" + "github.com/tensorflow/tensorflow/tensorflow/go/op" ) func Example() { @@ -88,7 +87,7 @@ func Example() { log.Fatal(err) } - model, err := ioutil.ReadFile(modelfile) + model, err := os.ReadFile(modelfile) if err != nil { log.Fatal(err) } @@ -145,7 +144,7 @@ func printBestLabel(probabilities []float32, labels []string) { // Convert the image in filename to a Tensor suitable as input to the Inception model. func makeTensorFromImage(filename string) (*tf.Tensor, error) { - bytes, err := ioutil.ReadFile(filename) + bytes, err := os.ReadFile(filename) if err != nil { return nil, err } diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go index 2b72b236a813a6..0c92d7e309aaca 100644 --- a/tensorflow/go/genop/internal/genop.go +++ b/tensorflow/go/genop/internal/genop.go @@ -39,7 +39,7 @@ import "C" import ( "fmt" "io" - "io/ioutil" + "os" "path" "reflect" "sort" @@ -96,7 +96,7 @@ func registeredOps() (*odpb.OpList, *apiDefMap, error) { } func updateAPIDefs(m *apiDefMap, dir string) error { - files, err := ioutil.ReadDir(dir) + files, err := os.ReadDir(dir) if err != nil { return err } @@ -104,7 +104,7 @@ func updateAPIDefs(m *apiDefMap, dir string) error { if file.IsDir() || !strings.HasSuffix(file.Name(), ".pbtxt") { continue } - data, err := ioutil.ReadFile(path.Join(dir, file.Name())) + data, err := os.ReadFile(path.Join(dir, file.Name())) if err != nil { return fmt.Errorf("failed to read %q: %v", file.Name(), err) } diff --git a/tensorflow/go/genop/main.go b/tensorflow/go/genop/main.go index 87c1d27c3b53d7..370a9aaec10a80 100644 --- a/tensorflow/go/genop/main.go +++ b/tensorflow/go/genop/main.go @@ -21,7 +21,6 @@ import ( "bytes" "flag" "go/format" - "io/ioutil" "log" "os" "path/filepath" @@ -42,7 +41,7 @@ func main() { log.Fatal("-outfile must be set") } if *header != "" { - hdr, err := ioutil.ReadFile(*header) + hdr, err := os.ReadFile(*header) if err != nil { log.Fatalf("Unable to read %s: %v", *header, err) } @@ -64,7 +63,7 @@ func main() { if err != nil { log.Fatalf("Failed to generate valid source? 
'go fmt' failed: %v", err) } - if err := ioutil.WriteFile(*filename, formatted, 0644); err != nil { + if err := os.WriteFile(*filename, formatted, 0644); err != nil { log.Fatalf("Failed to write to %q: %v", *filename, err) } } From fc1ee3bcc2d07d30bcd9480280796eb014bb0e2b Mon Sep 17 00:00:00 2001 From: Gauri1 Deshpande Date: Fri, 19 Jan 2024 10:11:06 -0800 Subject: [PATCH 002/670] [onednn] Enable auto_mixed_precision for fp16 on cpu --- .../optimizers/auto_mixed_precision.cc | 51 ++++++++++----- .../optimizers/auto_mixed_precision_lists.h | 24 +++++-- .../optimizers/auto_mixed_precision_test.cc | 63 ++++++++++++++++--- 3 files changed, 109 insertions(+), 29 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc index 7cab9376515a87..e8331ea8318490 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc @@ -47,6 +47,7 @@ limitations under the License. #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/statusor.h" #include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/util/util.h" namespace tensorflow { namespace grappler { @@ -1028,6 +1029,8 @@ std::unordered_map GetDevices(Cluster* cluster) { return devices; } +int GetNumGPUs(const Cluster& cluster); + class AutoMixedPrecisionImpl { public: // CastType indicates the type of inserted Cast op @@ -1038,7 +1041,8 @@ class AutoMixedPrecisionImpl { AutoMixedPrecisionImpl(Cluster* cluster, const std::unordered_set& nodes_to_preserve, GraphDef* graph, string id, - AutoMixedPrecisionMode mode) + AutoMixedPrecisionMode mode, + const bool run_fp16_on_cpu) : devices_(GetDevices(cluster)), virtual_placer_(devices_), nodes_to_preserve_(nodes_to_preserve), @@ -1053,7 +1057,9 @@ class AutoMixedPrecisionImpl { target_dtype_((mode_ == AutoMixedPrecisionMode::CUDA || mode_ == AutoMixedPrecisionMode::CPU) ? 
DT_HALF - : DT_BFLOAT16) {} + : DT_BFLOAT16), + num_gpus_(GetNumGPUs(*cluster)), + run_fp16_on_cpu_(run_fp16_on_cpu) {} Status Optimize(); @@ -1063,8 +1069,8 @@ class AutoMixedPrecisionImpl { std::unique_ptr get_mixed_precision_lists() const { switch (mode_) { case AutoMixedPrecisionMode::CUDA: - return std::make_unique(cuda_version_, - cudnn_version_); + return std::make_unique( + cuda_version_, cudnn_version_, run_fp16_on_cpu_); case AutoMixedPrecisionMode::BF16: return std::make_unique(); case AutoMixedPrecisionMode::CPU: @@ -1147,6 +1153,8 @@ class AutoMixedPrecisionImpl { gtl::FlatSet f16_clearlist_; absl::flat_hash_set should_process_nodes_; DataType target_dtype_; // Either DT_HALF or DT_BFLOAT16 + int num_gpus_ = 0; + bool run_fp16_on_cpu_ = false; }; NodeDef AutoMixedPrecisionImpl::BuildCastNode( @@ -1421,10 +1429,15 @@ Status AutoMixedPrecisionImpl::Optimize() { string device_type; switch (mode_) { case AutoMixedPrecisionMode::CUDA: - device_type = DEVICE_GPU; - should_process = - !MustPreserve(node) && IsOnDevice(node, device_type) && - (ShouldIgnorePerformance() || IsOnSuitableGPUArch(node)); + if (!run_fp16_on_cpu_) { + device_type = DEVICE_GPU; + should_process = + !MustPreserve(node) && IsOnDevice(node, device_type) && + (ShouldIgnorePerformance() || IsOnSuitableGPUArch(node)); + } else { + device_type = DEVICE_CPU; + should_process = !MustPreserve(node) && IsOnDevice(node, device_type); + } break; case AutoMixedPrecisionMode::BF16: case AutoMixedPrecisionMode::CPU: @@ -1857,7 +1870,7 @@ void AutoMixedPrecisionImpl::AddInferToAllowIfFollowAllow( const absl::flat_hash_set& deny_set, absl::flat_hash_set* allow_set) const { // Currently only target for oneDNN - if (mode_ != AutoMixedPrecisionMode::BF16) { + if (mode_ != AutoMixedPrecisionMode::BF16 && !run_fp16_on_cpu_) { return; } for (int item_idx = 0; item_idx < graph_type_view_.num_nodes(); ++item_idx) { @@ -2298,11 +2311,19 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item, *output = item.graph; int num_gpus = GetNumGPUs(*cluster); + bool run_fp16_on_cpu = false; if (num_gpus < 1 && mode_ == AutoMixedPrecisionMode::CUDA) { - // AutoMixedPrecision is currently only tuned for GPU. - LOG(WARNING) << "No (suitable) GPUs detected, skipping " << name() - << " graph optimizer"; - return OkStatus(); + // No GPUs to run AutoMixedPrecision in FP16. + // Check if CPU supports + if (!IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF)) { + LOG(WARNING) << "No support for " << name() << " graph optimizer on CPU/GPU"; + return OkStatus(); + } else { + run_fp16_on_cpu = true; + LOG(INFO) << "Running " << name() << " graph optimizer on CPU"; + } + } else { + LOG(INFO) << "Running " << name() << " graph optimizer on GPU"; } if (num_gpus >= 1 && mode_ == AutoMixedPrecisionMode::BF16) { @@ -2312,11 +2333,11 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item, // Optimize the output graph in-place. 
AutoMixedPrecisionImpl optimizer(cluster, item.NodesToPreserve(), output, - item.id, mode_); + item.id, mode_, run_fp16_on_cpu); if (item.id == "tf_graph") { LOG(INFO) << "Running " << name() << " graph optimizer"; } else { - VLOG(1) << "Running " << name() << " graph optimizer on " << item.id; + VLOG(INFO) << "Running " << name() << " graph optimizer on " << item.id; } Status status = optimizer.Optimize(); if (!status.ok()) { diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h index f8f393a1cb960f..63a45f33b977ce 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/util/util.h" namespace tensorflow { namespace grappler { @@ -106,8 +107,11 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { } public: - AutoMixedPrecisionListsCuda(int cuda_version, int cudnn_version) - : cuda_version_(cuda_version), cudnn_version_(cudnn_version) {} + AutoMixedPrecisionListsCuda(int cuda_version, int cudnn_version, + bool run_fp16_on_cpu = false) + : cuda_version_(cuda_version), + cudnn_version_(cudnn_version), + run_fp16_on_cpu_(run_fp16_on_cpu) {} gtl::FlatSet AllowList() override { auto list = gtl::FlatSet{ @@ -143,13 +147,13 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { #if TENSORFLOW_USE_ROCM if (true) { #else - if (cuda_version_ >= 9010) { + if (cuda_version_ >= 9010 || run_fp16_on_cpu_) { // Fp16 BatchMatMul is slow before CUDA 9.1. #endif list.insert("BatchMatMul"); list.insert("BatchMatMulV2"); } - if (cudnn_version_ >= 7602) { + if (cudnn_version_ >= 7602 || run_fp16_on_cpu_) { // Fp16 3D conv is slow before CUDNN 7.6.2. list.insert("Conv3D"); list.insert("Conv3DBackpropFilter"); @@ -157,7 +161,7 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { list.insert("Conv3DBackpropInput"); list.insert("Conv3DBackpropInputV2"); } - if (cudnn_version_ >= 8000) { + if (cudnn_version_ >= 8000 || run_fp16_on_cpu_) { list.insert("DepthwiseConv2dNative"); list.insert("DepthwiseConv2dNativeBackpropFilter"); list.insert("DepthwiseConv2dNativeBackpropInput"); @@ -220,6 +224,11 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { "Tanh", "TanhGrad", }; + if (run_fp16_on_cpu_) { + list.insert("Rsqrt"); + list.insert("Square"); + list.insert("SquaredDifference"); + } UpdateList("INFERLIST", &list); // For backwards compatibility, keeping the original env variable here. // TODO(reedwm): This should be removed if we don't have active users. 
@@ -352,6 +361,10 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { "Where", "ZerosLike", }; + if (run_fp16_on_cpu_) { + list.insert("ResizeBilinear"); + list.insert("ScatterNd"); + } AddTensorListOps(&list); UpdateList("CLEARLIST", &list); return list; @@ -360,6 +373,7 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { private: int cuda_version_; int cudnn_version_; + bool run_fp16_on_cpu_; }; class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc index 689185fb08923d..91ce01425d5c1e 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc @@ -93,7 +93,7 @@ void VerifyGraphsEquivalent(const GraphDef& original_graph, // because otherwise the optimizer will not turn clearlist nodes to float16. // When looking at clearlist nodes, this optimizer checks if the nodes have a // float16 GPU OpKernel, but without CUDA/HIP there are no GPU OpKernels at all. -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM || INTEL_MKL const std::pair kMinGPUArch = {7, 0}; @@ -112,19 +112,33 @@ class AutoMixedPrecisionTest : public GrapplerTest { if (gpu_available_) { virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1)); } else { - DeviceProperties device_properties; - device_properties.set_type("GPU"); + if( num_gpus > 0) { + DeviceProperties device_properties; + device_properties.set_type("GPU"); #if GOOGLE_CUDA - device_properties.mutable_environment()->insert({"architecture", "7"}); - device_properties.mutable_environment()->insert({"cuda", "9010"}); + device_properties.mutable_environment()->insert({"architecture", "7"}); + device_properties.mutable_environment()->insert({"cuda", "9010"}); #else - device_properties.mutable_environment()->insert( - {"architecture", "gfx906"}); + device_properties.mutable_environment()->insert( + {"architecture", "gfx906"}); #endif - virtual_cluster_.reset( - new VirtualCluster({{"/GPU:1", device_properties}})); + virtual_cluster_.reset( + new VirtualCluster({{"/GPU:1", device_properties}})); + } else { + // try running on CPU + DeviceProperties device_properties; + device_properties.set_type("CPU"); + virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 0)); + } } TF_CHECK_OK(virtual_cluster_->Provision()); + + run_fp16_on_cpu_ = false; +#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3) + run_fp16_on_cpu_ = IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF); +#endif // INTEL_MKL && ENABLE_ONEDNN_V3 + + skip_test_ = !gpu_available_ && (!IsMKLEnabled() || !run_fp16_on_cpu_); } void TearDown() override { TF_CHECK_OK(virtual_cluster_->Shutdown()); } @@ -172,6 +186,8 @@ class AutoMixedPrecisionTest : public GrapplerTest { double input_min, double input_max, double atol, double rtol, const std::function& test_op_factory) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; + int size = 128; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output eye = ops::Const(s.WithOpName("eye"), @@ -210,11 +226,18 @@ class AutoMixedPrecisionTest : public GrapplerTest { } } + bool ShouldSkipTest() { + return skip_test_; + } + std::unique_ptr virtual_cluster_; bool gpu_available_; + bool skip_test_; + bool run_fp16_on_cpu_; }; TEST_F(AutoMixedPrecisionTest, NoOp) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device 
doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.234f, {32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -252,6 +275,7 @@ TEST_F(AutoMixedPrecisionTest, NoOp) { } TEST_F(AutoMixedPrecisionTest, AlreadyFp16) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f, {32, 32}); Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_HALF); @@ -290,6 +314,7 @@ TEST_F(AutoMixedPrecisionTest, AlreadyFp16) { } TEST_F(AutoMixedPrecisionTest, Simple) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -339,6 +364,7 @@ TEST_F(AutoMixedPrecisionTest, Simple) { } TEST_F(AutoMixedPrecisionTest, NoInferOp) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; setenv("TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL", "TREAT_INFER_AS_DENY", 1 /* replace */); @@ -391,6 +417,7 @@ TEST_F(AutoMixedPrecisionTest, NoInferOp) { } TEST_F(AutoMixedPrecisionTest, BidirectionalClearChain) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output clr1 = ops::Relu(s.WithOpName("clr1"), input); @@ -430,6 +457,7 @@ TEST_F(AutoMixedPrecisionTest, BidirectionalClearChain) { } TEST_F(AutoMixedPrecisionTest, PreserveFetches) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output allow1 = ops::MatMul(s.WithOpName("allow1"), input, input); @@ -475,6 +503,10 @@ TEST_F(AutoMixedPrecisionTest, PreserveFetches) { } TEST_F(AutoMixedPrecisionTest, PreserveCPUNodes) { + if (GetNumAvailableGPUs() == 0) { + GTEST_SKIP() << "This test is not required on CPU"; + } + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output clr1 = ops::Relu(s.WithOpName("clr1"), input); @@ -516,6 +548,7 @@ TEST_F(AutoMixedPrecisionTest, PreserveCPUNodes) { } TEST_F(AutoMixedPrecisionTest, PreserveIdentityAfterVariable) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output var1 = ops::Variable(s.WithOpName("var1"), {32, 32}, DT_FLOAT); @@ -560,6 +593,7 @@ TEST_F(AutoMixedPrecisionTest, PreserveIdentityAfterVariable) { } TEST_F(AutoMixedPrecisionTest, FusedBatchNorm) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); // Uses NHWC data format because non-GPU execution does not support NCHW. 
Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {8, 56, 56, 16}); @@ -619,6 +653,7 @@ TEST_F(AutoMixedPrecisionTest, FusedBatchNorm) { } TEST_F(AutoMixedPrecisionTest, RepeatedAndListTypeAttrs) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output allow1 = ops::MatMul(s.WithOpName("allow1"), input, input); @@ -659,6 +694,7 @@ TEST_F(AutoMixedPrecisionTest, RepeatedAndListTypeAttrs) { } TEST_F(AutoMixedPrecisionTest, ExistingCast) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), true, {32, 32}); Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_FLOAT); @@ -691,6 +727,7 @@ TEST_F(AutoMixedPrecisionTest, ExistingCast) { } TEST_F(AutoMixedPrecisionTest, RecurrentEdgeColorMismatch) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -753,6 +790,7 @@ TEST_F(AutoMixedPrecisionTest, RecurrentEdgeColorMismatch) { } TEST_F(AutoMixedPrecisionTest, TensorListSetGet) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32, 32}; auto tl1 = ops::TensorListReserve(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT); @@ -824,6 +862,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListSetGet) { } TEST_F(AutoMixedPrecisionTest, TensorListPushPop) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32, 32}; auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT); @@ -887,6 +926,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListPushPop) { } TEST_F(AutoMixedPrecisionTest, TensorListFromTensor) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32}; Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); @@ -937,6 +977,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListFromTensor) { } TEST_F(AutoMixedPrecisionTest, TensorListPushBackBatchAndConcatLists) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32, 32}; auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT); @@ -997,6 +1038,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListPushBackBatchAndConcatLists) { } TEST_F(AutoMixedPrecisionTest, TensorListThroughFunction) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; // This test passes a tensor list handle through a function with its own // Tensor List ops inside to test that the types are not changed to a // conflicting state. 
@@ -1105,6 +1147,7 @@ bool IsSupportedGPU(const Cluster& cluster) { } TEST_F(AutoMixedPrecisionTest, BatchMatMul) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 33, {64, 32, 32}); Output allow1 = ops::BatchMatMul(s.WithOpName("allow1"), input, input); @@ -1437,6 +1480,7 @@ class AutoMixedPrecisionSimulateGpuTest : public GrapplerTest { } }; +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM TEST_F(AutoMixedPrecisionSimulateGpuTest, Simple_NoGpu) { TestSimple(tensorflow::Scope::NewRootScope(), /* is_optimized= */ false); } @@ -1456,6 +1500,7 @@ TEST_F(AutoMixedPrecisionSimulateGpuTest, Simple_SimulatedGpu_CpuScope) { } #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM || INTEL_MKL #if INTEL_MKL From 9a3c8d59eb65fac7253b8f9a0db92dc93e22e374 Mon Sep 17 00:00:00 2001 From: Gauri1 Deshpande Date: Tue, 23 Jan 2024 12:41:51 -0800 Subject: [PATCH 003/670] Address review comments - update comments as per guidelines. --- tensorflow/core/grappler/optimizers/auto_mixed_precision.cc | 2 +- .../core/grappler/optimizers/auto_mixed_precision_test.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc index e8331ea8318490..466de8be2b4e83 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc @@ -2314,7 +2314,7 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item, bool run_fp16_on_cpu = false; if (num_gpus < 1 && mode_ == AutoMixedPrecisionMode::CUDA) { // No GPUs to run AutoMixedPrecision in FP16. - // Check if CPU supports + // Check if CPU supports FP16. if (!IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF)) { LOG(WARNING) << "No support for " << name() << " graph optimizer on CPU/GPU"; return OkStatus(); diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc index 91ce01425d5c1e..3dc34150cb0806 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc @@ -125,7 +125,7 @@ class AutoMixedPrecisionTest : public GrapplerTest { virtual_cluster_.reset( new VirtualCluster({{"/GPU:1", device_properties}})); } else { - // try running on CPU + // When no GPUs are available, try running on CPU. 
DeviceProperties device_properties; device_properties.set_type("CPU"); virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 0)); From 27b1a9623f72a5acd13a864d1d00b47a85ca43fe Mon Sep 17 00:00:00 2001 From: Gauri1 Deshpande Date: Tue, 30 Jan 2024 12:14:03 -0800 Subject: [PATCH 004/670] Address review comments --- .../optimizers/auto_mixed_precision_test.cc | 37 +++++++++---------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc index 3dc34150cb0806..c6234ac74a6cb7 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc @@ -111,34 +111,32 @@ class AutoMixedPrecisionTest : public GrapplerTest { #endif if (gpu_available_) { virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1)); - } else { - if( num_gpus > 0) { - DeviceProperties device_properties; - device_properties.set_type("GPU"); + } else if( num_gpus > 0) { + DeviceProperties device_properties; + device_properties.set_type("GPU"); #if GOOGLE_CUDA - device_properties.mutable_environment()->insert({"architecture", "7"}); - device_properties.mutable_environment()->insert({"cuda", "9010"}); + device_properties.mutable_environment()->insert({"architecture", "7"}); + device_properties.mutable_environment()->insert({"cuda", "9010"}); #else - device_properties.mutable_environment()->insert( - {"architecture", "gfx906"}); + device_properties.mutable_environment()->insert( + {"architecture", "gfx906"}); #endif - virtual_cluster_.reset( - new VirtualCluster({{"/GPU:1", device_properties}})); - } else { - // When no GPUs are available, try running on CPU. - DeviceProperties device_properties; - device_properties.set_type("CPU"); - virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 0)); - } + virtual_cluster_.reset( + new VirtualCluster({{"/GPU:1", device_properties}})); + } else { + // When no GPUs are available, try running on CPU. 
+ DeviceProperties device_properties; + device_properties.set_type("CPU"); + virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 0)); } TF_CHECK_OK(virtual_cluster_->Provision()); - run_fp16_on_cpu_ = false; + bool run_fp16_on_cpu = false; #if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3) - run_fp16_on_cpu_ = IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF); + run_fp16_on_cpu = IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF); #endif // INTEL_MKL && ENABLE_ONEDNN_V3 - skip_test_ = !gpu_available_ && (!IsMKLEnabled() || !run_fp16_on_cpu_); + skip_test_ = !gpu_available_ && (!IsMKLEnabled() || !run_fp16_on_cpu); } void TearDown() override { TF_CHECK_OK(virtual_cluster_->Shutdown()); } @@ -233,7 +231,6 @@ class AutoMixedPrecisionTest : public GrapplerTest { std::unique_ptr virtual_cluster_; bool gpu_available_; bool skip_test_; - bool run_fp16_on_cpu_; }; TEST_F(AutoMixedPrecisionTest, NoOp) { From 9a3ef902974965aacfaf2192b3a7253dcd0609f8 Mon Sep 17 00:00:00 2001 From: Gauri1 Deshpande Date: Fri, 9 Feb 2024 12:30:25 -0800 Subject: [PATCH 005/670] Address review comments --- .../optimizers/auto_mixed_precision.cc | 75 +++--- .../optimizers/auto_mixed_precision.h | 12 +- .../optimizers/auto_mixed_precision_lists.h | 40 ++-- .../optimizers/auto_mixed_precision_test.cc | 221 ++++++++++-------- .../grappler/optimizers/meta_optimizer.cc | 10 +- 5 files changed, 198 insertions(+), 160 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc index 466de8be2b4e83..40201896086e4f 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc @@ -47,7 +47,6 @@ limitations under the License. #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/statusor.h" #include "tensorflow/core/util/env_var.h" -#include "tensorflow/core/util/util.h" namespace tensorflow { namespace grappler { @@ -1029,8 +1028,6 @@ std::unordered_map GetDevices(Cluster* cluster) { return devices; } -int GetNumGPUs(const Cluster& cluster); - class AutoMixedPrecisionImpl { public: // CastType indicates the type of inserted Cast op @@ -1041,8 +1038,7 @@ class AutoMixedPrecisionImpl { AutoMixedPrecisionImpl(Cluster* cluster, const std::unordered_set& nodes_to_preserve, GraphDef* graph, string id, - AutoMixedPrecisionMode mode, - const bool run_fp16_on_cpu) + AutoMixedPrecisionMode mode) : devices_(GetDevices(cluster)), virtual_placer_(devices_), nodes_to_preserve_(nodes_to_preserve), @@ -1055,11 +1051,10 @@ class AutoMixedPrecisionImpl { num_nonvar_casts_to_f16_(0), mode_(mode), target_dtype_((mode_ == AutoMixedPrecisionMode::CUDA || - mode_ == AutoMixedPrecisionMode::CPU) + mode_ == AutoMixedPrecisionMode::CPU || + mode_ == AutoMixedPrecisionMode::FP16_CPU) ? DT_HALF - : DT_BFLOAT16), - num_gpus_(GetNumGPUs(*cluster)), - run_fp16_on_cpu_(run_fp16_on_cpu) {} + : DT_BFLOAT16) {} Status Optimize(); @@ -1069,16 +1064,20 @@ class AutoMixedPrecisionImpl { std::unique_ptr get_mixed_precision_lists() const { switch (mode_) { case AutoMixedPrecisionMode::CUDA: - return std::make_unique( - cuda_version_, cudnn_version_, run_fp16_on_cpu_); + return std::make_unique(cuda_version_, + cudnn_version_); case AutoMixedPrecisionMode::BF16: - return std::make_unique(); + return std::make_unique( + AutoMixedPrecisionMode::BF16); case AutoMixedPrecisionMode::CPU: // Note: this is not a typo here. 
AutoMixedPrecisionListsCuda is used // intentionally to make CPU and GPU have the same fp16 ops. return std::make_unique( /*cuda_version=*/10000, // Hardcode cuda and cudnn version so /*cudnn_version=*/8000); // CPU emulates the same ops on GPU. + case AutoMixedPrecisionMode::FP16_CPU: + return std::make_unique( + AutoMixedPrecisionMode::FP16_CPU); } } Status PrintDebugLogs(bool preop, size_t timestamp); @@ -1153,8 +1152,6 @@ class AutoMixedPrecisionImpl { gtl::FlatSet f16_clearlist_; absl::flat_hash_set should_process_nodes_; DataType target_dtype_; // Either DT_HALF or DT_BFLOAT16 - int num_gpus_ = 0; - bool run_fp16_on_cpu_ = false; }; NodeDef AutoMixedPrecisionImpl::BuildCastNode( @@ -1392,9 +1389,11 @@ Status AutoMixedPrecisionImpl::Optimize() { "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL", "", &optimization_level)); optimization_level = absl::AsciiStrToUpper(optimization_level); force_all_fp16_ = optimization_level == "UNSAFE_FORCE_ALL"; - if (force_all_fp16_ && mode_ == AutoMixedPrecisionMode::BF16) { - // Many ops do not support bfloat16 on the CPU so we disallowing forcing to - // bfloat16. + if (force_all_fp16_ && + (mode_ == AutoMixedPrecisionMode::BF16 || + mode_ == AutoMixedPrecisionMode::FP16_CPU)) { + // Many ops do not support bfloat16/fp16 on the CPU. So, disallowing + // forcing to bfloat16/fp16. return errors::InvalidArgument( "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL cannot be set to " "UNSAFE_FORCE_ALL when oneDNN is used"); @@ -1429,18 +1428,14 @@ Status AutoMixedPrecisionImpl::Optimize() { string device_type; switch (mode_) { case AutoMixedPrecisionMode::CUDA: - if (!run_fp16_on_cpu_) { - device_type = DEVICE_GPU; - should_process = - !MustPreserve(node) && IsOnDevice(node, device_type) && - (ShouldIgnorePerformance() || IsOnSuitableGPUArch(node)); - } else { - device_type = DEVICE_CPU; - should_process = !MustPreserve(node) && IsOnDevice(node, device_type); - } + device_type = DEVICE_GPU; + should_process = + !MustPreserve(node) && IsOnDevice(node, device_type) && + (ShouldIgnorePerformance() || IsOnSuitableGPUArch(node)); break; case AutoMixedPrecisionMode::BF16: case AutoMixedPrecisionMode::CPU: + case AutoMixedPrecisionMode::FP16_CPU: device_type = DEVICE_CPU; should_process = !MustPreserve(node) && IsOnDevice(node, device_type); break; @@ -1870,7 +1865,8 @@ void AutoMixedPrecisionImpl::AddInferToAllowIfFollowAllow( const absl::flat_hash_set& deny_set, absl::flat_hash_set* allow_set) const { // Currently only target for oneDNN - if (mode_ != AutoMixedPrecisionMode::BF16 && !run_fp16_on_cpu_) { + if (mode_ != AutoMixedPrecisionMode::BF16 && + mode_ != AutoMixedPrecisionMode::FP16_CPU) { return; } for (int item_idx = 0; item_idx < graph_type_view_.num_nodes(); ++item_idx) { @@ -2311,20 +2307,19 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item, *output = item.graph; int num_gpus = GetNumGPUs(*cluster); - bool run_fp16_on_cpu = false; if (num_gpus < 1 && mode_ == AutoMixedPrecisionMode::CUDA) { // No GPUs to run AutoMixedPrecision in FP16. - // Check if CPU supports FP16. 
- if (!IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF)) { - LOG(WARNING) << "No support for " << name() << " graph optimizer on CPU/GPU"; - return OkStatus(); - } else { - run_fp16_on_cpu = true; - LOG(INFO) << "Running " << name() << " graph optimizer on CPU"; - } - } else { - LOG(INFO) << "Running " << name() << " graph optimizer on GPU"; + LOG(WARNING) << "No (suitable) GPUs detected, skipping " << name() + << " graph optimizer"; + return OkStatus(); + } + // Check if CPU supports FP16 + if (mode_ == AutoMixedPrecisionMode::FP16_CPU && + !IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF)) { + LOG(WARNING) << "No support for " << name() << " graph optimizer on CPU"; + return OkStatus(); } + LOG(INFO) << "Running " << name() << " graph optimizer "; if (num_gpus >= 1 && mode_ == AutoMixedPrecisionMode::BF16) { LOG(WARNING) << "Note: GPUs detected. Using " << name() @@ -2333,11 +2328,11 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item, // Optimize the output graph in-place. AutoMixedPrecisionImpl optimizer(cluster, item.NodesToPreserve(), output, - item.id, mode_, run_fp16_on_cpu); + item.id, mode_); if (item.id == "tf_graph") { LOG(INFO) << "Running " << name() << " graph optimizer"; } else { - VLOG(INFO) << "Running " << name() << " graph optimizer on " << item.id; + LOG(INFO) << "Running " << name() << " graph optimizer on " << item.id; } Status status = optimizer.Optimize(); if (!status.ok()) { diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.h b/tensorflow/core/grappler/optimizers/auto_mixed_precision.h index 0807d740f1448c..3f478ec3038534 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.h +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.h @@ -26,15 +26,16 @@ namespace grappler { // CUDA: convert to float16 on GPU // BF16: convert to bfloat16 on CPU // CPU: emulate float16 on CPU without changing operator kernel -enum class AutoMixedPrecisionMode { CUDA, BF16, CPU }; +// FP16_CPU : convert to float16 on CPU +enum class AutoMixedPrecisionMode { CUDA, BF16, CPU, FP16_CPU }; // Convert data types to float16 or bfloat16 where appropriate to improve // performance on GPUs or CPUs. class AutoMixedPrecision : public GraphOptimizer { public: - // If 'mode' is CUDA, converts nodes to float16 on Nvidia GPUs. If BF16, - // converts nodes to bfloat16 on CPUs in order to take advantage of oneDNN - // performance improvements with bfloat16. + // If 'mode' is CUDA, converts nodes to float16 on Nvidia GPUs. If BF16 or + // FP16_CPU, converts nodes to bfloat16/fp16 on CPUs in order to take + // advantage of oneDNN performance improvements with bfloat16/fp16. explicit AutoMixedPrecision( AutoMixedPrecisionMode mode = AutoMixedPrecisionMode::CUDA) : mode_(mode) {} @@ -49,6 +50,9 @@ class AutoMixedPrecision : public GraphOptimizer { return "auto_mixed_precision_onednn_bfloat16"; case AutoMixedPrecisionMode::CPU: return "auto_mixed_precision_cpu"; + case AutoMixedPrecisionMode::FP16_CPU: + // Note: use same config for FP16 on CPU & GPU. 
+ return "auto_mixed_precision"; default: LOG(FATAL) << "Invalid value for AutoMixedPrecisionMode: " // Crash Ok << static_cast(mode_); diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h index 63a45f33b977ce..4a520fa3377e8a 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h @@ -107,11 +107,8 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { } public: - AutoMixedPrecisionListsCuda(int cuda_version, int cudnn_version, - bool run_fp16_on_cpu = false) - : cuda_version_(cuda_version), - cudnn_version_(cudnn_version), - run_fp16_on_cpu_(run_fp16_on_cpu) {} + AutoMixedPrecisionListsCuda(int cuda_version, int cudnn_version) + : cuda_version_(cuda_version), cudnn_version_(cudnn_version) {} gtl::FlatSet AllowList() override { auto list = gtl::FlatSet{ @@ -147,13 +144,13 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { #if TENSORFLOW_USE_ROCM if (true) { #else - if (cuda_version_ >= 9010 || run_fp16_on_cpu_) { + if (cuda_version_ >= 9010) { // Fp16 BatchMatMul is slow before CUDA 9.1. #endif list.insert("BatchMatMul"); list.insert("BatchMatMulV2"); } - if (cudnn_version_ >= 7602 || run_fp16_on_cpu_) { + if (cudnn_version_ >= 7602) { // Fp16 3D conv is slow before CUDNN 7.6.2. list.insert("Conv3D"); list.insert("Conv3DBackpropFilter"); @@ -161,7 +158,7 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { list.insert("Conv3DBackpropInput"); list.insert("Conv3DBackpropInputV2"); } - if (cudnn_version_ >= 8000 || run_fp16_on_cpu_) { + if (cudnn_version_ >= 8000) { list.insert("DepthwiseConv2dNative"); list.insert("DepthwiseConv2dNativeBackpropFilter"); list.insert("DepthwiseConv2dNativeBackpropInput"); @@ -224,11 +221,6 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { "Tanh", "TanhGrad", }; - if (run_fp16_on_cpu_) { - list.insert("Rsqrt"); - list.insert("Square"); - list.insert("SquaredDifference"); - } UpdateList("INFERLIST", &list); // For backwards compatibility, keeping the original env variable here. // TODO(reedwm): This should be removed if we don't have active users. @@ -361,10 +353,6 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { "Where", "ZerosLike", }; - if (run_fp16_on_cpu_) { - list.insert("ResizeBilinear"); - list.insert("ScatterNd"); - } AddTensorListOps(&list); UpdateList("CLEARLIST", &list); return list; @@ -373,12 +361,11 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { private: int cuda_version_; int cudnn_version_; - bool run_fp16_on_cpu_; }; class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { public: - AutoMixedPrecisionListsMkl() {} + AutoMixedPrecisionListsMkl(AutoMixedPrecisionMode mode) : mode_(mode) {} // Only ops which are supported by MKL in bfloat16 should be added to the // allow list, infer list, or clear list. 
@@ -417,13 +404,14 @@ class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { "BiasAddGrad", "BiasAddV1", "Erf", + "Erfc", "FusedBatchNormV2", "FusedBatchNormGradV2", "FusedBatchNormV3", "FusedBatchNormGradV3", + "Inv", "LeakyRelu", "LeakyReluGrad", - "Mean", "Mul", "Sub", "Elu", @@ -449,9 +437,12 @@ class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { "Sqrt", "Square", "SquaredDifference", - "Sum", "Tanh", "TanhGrad"}; + if (mode_ != AutoMixedPrecisionMode::FP16_CPU) { + list.insert("Mean"); + list.insert("Sum"); + } UpdateList("INFERLIST", &list); // For backwards compatibility, keeping the original env variable here. // TODO(reedwm): This should be removed if we don't have active users. @@ -469,6 +460,10 @@ class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { "SoftmaxCrossEntropyWithLogits", "SparseSoftmaxCrossEntropyWithLogits", }; + if (mode_ == AutoMixedPrecisionMode::FP16_CPU) { + list.insert("Mean"); + list.insert("Sum"); + } UpdateList("DENYLIST", &list); // For backwards compatibility, keeping the original env variable here. // TODO(reedwm): This should be removed if we don't have active users. @@ -505,6 +500,7 @@ class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { "Greater", "GreaterEqual", "Identity", + "IdentityN", "IsFinite", "IsInf", "IsNan", @@ -576,6 +572,8 @@ class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { UpdateList("CLEARLIST", &list); return list; } + private: + AutoMixedPrecisionMode mode_; }; } // end namespace grappler diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc index c6234ac74a6cb7..c8b2e18318a143 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc @@ -128,15 +128,14 @@ class AutoMixedPrecisionTest : public GrapplerTest { DeviceProperties device_properties; device_properties.set_type("CPU"); virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 0)); - } - TF_CHECK_OK(virtual_cluster_->Provision()); - bool run_fp16_on_cpu = false; + bool run_fp16_on_cpu = false; #if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3) - run_fp16_on_cpu = IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF); + run_fp16_on_cpu = IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF); #endif // INTEL_MKL && ENABLE_ONEDNN_V3 + } + TF_CHECK_OK(virtual_cluster_->Provision()); - skip_test_ = !gpu_available_ && (!IsMKLEnabled() || !run_fp16_on_cpu); } void TearDown() override { TF_CHECK_OK(virtual_cluster_->Shutdown()); } @@ -183,8 +182,8 @@ class AutoMixedPrecisionTest : public GrapplerTest { void TestSimpleUnaryInferOp( double input_min, double input_max, double atol, double rtol, const std::function& - test_op_factory) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; + test_op_factory, AutoMixedPrecisionMode mode) { + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; int size = 128; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); @@ -203,7 +202,7 @@ class AutoMixedPrecisionTest : public GrapplerTest { std::vector> feed = {{"input", input_tensor}}; auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -224,17 +223,27 @@ class AutoMixedPrecisionTest : public GrapplerTest { } } - 
bool ShouldSkipTest() { - return skip_test_; + bool ShouldSkipTest(AutoMixedPrecisionMode mode) { + if (mode == AutoMixedPrecisionMode::CUDA && GetNumAvailableGPUs() > 0 || + mode == AutoMixedPrecisionMode::FP16_CPU && is_fp16_enabled_on_cpu_) { + return false; + } else { + return true; + } } std::unique_ptr virtual_cluster_; bool gpu_available_; - bool skip_test_; + bool is_fp16_enabled_on_cpu_; }; -TEST_F(AutoMixedPrecisionTest, NoOp) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +class AutoMixedPrecisionParamTest : public AutoMixedPrecisionTest, + public ::testing::WithParamInterface< + AutoMixedPrecisionMode> {}; + +TEST_P(AutoMixedPrecisionParamTest, NoOp) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.234f, {32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -248,7 +257,7 @@ TEST_F(AutoMixedPrecisionTest, NoOp) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -271,8 +280,9 @@ TEST_F(AutoMixedPrecisionTest, NoOp) { } } -TEST_F(AutoMixedPrecisionTest, AlreadyFp16) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, AlreadyFp16) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f, {32, 32}); Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_HALF); @@ -287,7 +297,7 @@ TEST_F(AutoMixedPrecisionTest, AlreadyFp16) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); VLOG(1) << output.DebugString(); @@ -310,8 +320,9 @@ TEST_F(AutoMixedPrecisionTest, AlreadyFp16) { } } -TEST_F(AutoMixedPrecisionTest, Simple) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, Simple) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -331,7 +342,7 @@ TEST_F(AutoMixedPrecisionTest, Simple) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -360,11 +371,11 @@ TEST_F(AutoMixedPrecisionTest, Simple) { } } -TEST_F(AutoMixedPrecisionTest, NoInferOp) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, NoInferOp) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; setenv("TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL", "TREAT_INFER_AS_DENY", 1 /* replace 
*/); - tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -384,7 +395,7 @@ TEST_F(AutoMixedPrecisionTest, NoInferOp) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -413,8 +424,9 @@ TEST_F(AutoMixedPrecisionTest, NoInferOp) { unsetenv("TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL"); } -TEST_F(AutoMixedPrecisionTest, BidirectionalClearChain) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, BidirectionalClearChain) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output clr1 = ops::Relu(s.WithOpName("clr1"), input); @@ -430,7 +442,7 @@ TEST_F(AutoMixedPrecisionTest, BidirectionalClearChain) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -453,8 +465,9 @@ TEST_F(AutoMixedPrecisionTest, BidirectionalClearChain) { } } -TEST_F(AutoMixedPrecisionTest, PreserveFetches) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, PreserveFetches) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output allow1 = ops::MatMul(s.WithOpName("allow1"), input, input); @@ -472,7 +485,7 @@ TEST_F(AutoMixedPrecisionTest, PreserveFetches) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -499,11 +512,11 @@ TEST_F(AutoMixedPrecisionTest, PreserveFetches) { } } -TEST_F(AutoMixedPrecisionTest, PreserveCPUNodes) { - if (GetNumAvailableGPUs() == 0) { +TEST_P(AutoMixedPrecisionParamTest, PreserveCPUNodes) { + AutoMixedPrecisionMode mode = GetParam(); + if (mode == AutoMixedPrecisionMode::FP16_CPU) { GTEST_SKIP() << "This test is not required on CPU"; } - tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output clr1 = ops::Relu(s.WithOpName("clr1"), input); @@ -521,7 +534,7 @@ TEST_F(AutoMixedPrecisionTest, PreserveCPUNodes) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -544,8 +557,9 @@ TEST_F(AutoMixedPrecisionTest, PreserveCPUNodes) { } } -TEST_F(AutoMixedPrecisionTest, PreserveIdentityAfterVariable) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, 
PreserveIdentityAfterVariable) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output var1 = ops::Variable(s.WithOpName("var1"), {32, 32}, DT_FLOAT); @@ -565,7 +579,7 @@ TEST_F(AutoMixedPrecisionTest, PreserveIdentityAfterVariable) { std::vector> feed = {{"var1", var1_tensor}}; auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -589,8 +603,9 @@ TEST_F(AutoMixedPrecisionTest, PreserveIdentityAfterVariable) { } } -TEST_F(AutoMixedPrecisionTest, FusedBatchNorm) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, FusedBatchNorm) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); // Uses NHWC data format because non-GPU execution does not support NCHW. Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {8, 56, 56, 16}); @@ -623,7 +638,7 @@ TEST_F(AutoMixedPrecisionTest, FusedBatchNorm) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -649,8 +664,9 @@ TEST_F(AutoMixedPrecisionTest, FusedBatchNorm) { } } -TEST_F(AutoMixedPrecisionTest, RepeatedAndListTypeAttrs) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, RepeatedAndListTypeAttrs) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output allow1 = ops::MatMul(s.WithOpName("allow1"), input, input); @@ -666,7 +682,7 @@ TEST_F(AutoMixedPrecisionTest, RepeatedAndListTypeAttrs) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -690,8 +706,9 @@ TEST_F(AutoMixedPrecisionTest, RepeatedAndListTypeAttrs) { } } -TEST_F(AutoMixedPrecisionTest, ExistingCast) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, ExistingCast) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), true, {32, 32}); Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_FLOAT); @@ -703,7 +720,7 @@ TEST_F(AutoMixedPrecisionTest, ExistingCast) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -723,8 +740,9 @@ TEST_F(AutoMixedPrecisionTest, ExistingCast) { } } 
-TEST_F(AutoMixedPrecisionTest, RecurrentEdgeColorMismatch) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, RecurrentEdgeColorMismatch) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -757,7 +775,7 @@ TEST_F(AutoMixedPrecisionTest, RecurrentEdgeColorMismatch) { const_node->add_input("^mrg1"); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -786,8 +804,9 @@ TEST_F(AutoMixedPrecisionTest, RecurrentEdgeColorMismatch) { } } -TEST_F(AutoMixedPrecisionTest, TensorListSetGet) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, TensorListSetGet) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32, 32}; auto tl1 = ops::TensorListReserve(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT); @@ -829,7 +848,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListSetGet) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -858,8 +877,9 @@ TEST_F(AutoMixedPrecisionTest, TensorListSetGet) { } } -TEST_F(AutoMixedPrecisionTest, TensorListPushPop) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, TensorListPushPop) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32, 32}; auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT); @@ -893,7 +913,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListPushPop) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -922,8 +942,9 @@ TEST_F(AutoMixedPrecisionTest, TensorListPushPop) { } } -TEST_F(AutoMixedPrecisionTest, TensorListFromTensor) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, TensorListFromTensor) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32}; Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); @@ -948,7 +969,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListFromTensor) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -973,8 +994,9 @@ 
TEST_F(AutoMixedPrecisionTest, TensorListFromTensor) { } } -TEST_F(AutoMixedPrecisionTest, TensorListPushBackBatchAndConcatLists) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, TensorListPushBackBatchAndConcatLists) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32, 32}; auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT); @@ -1009,7 +1031,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListPushBackBatchAndConcatLists) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -1034,8 +1056,9 @@ TEST_F(AutoMixedPrecisionTest, TensorListPushBackBatchAndConcatLists) { } } -TEST_F(AutoMixedPrecisionTest, TensorListThroughFunction) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, TensorListThroughFunction) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; // This test passes a tensor list handle through a function with its own // Tensor List ops inside to test that the types are not changed to a // conflicting state. @@ -1096,7 +1119,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListThroughFunction) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -1143,8 +1166,9 @@ bool IsSupportedGPU(const Cluster& cluster) { #endif } -TEST_F(AutoMixedPrecisionTest, BatchMatMul) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; +TEST_P(AutoMixedPrecisionParamTest, BatchMatMul) { + AutoMixedPrecisionMode mode = GetParam(); + if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 33, {64, 32, 32}); Output allow1 = ops::BatchMatMul(s.WithOpName("allow1"), input, input); @@ -1155,7 +1179,7 @@ TEST_F(AutoMixedPrecisionTest, BatchMatMul) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer; + AutoMixedPrecision optimizer(mode); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -1179,110 +1203,120 @@ TEST_F(AutoMixedPrecisionTest, BatchMatMul) { } } -TEST_F(AutoMixedPrecisionTest, EluOp) { +TEST_P(AutoMixedPrecisionParamTest, EluOp) { TestSimpleUnaryInferOp( -5, 5, 1.0e-3, 1.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Elu(scope, input); - }); + }, GetParam()); } -TEST_F(AutoMixedPrecisionTest, ErfOp) { +TEST_P(AutoMixedPrecisionParamTest, ErfOp) { TestSimpleUnaryInferOp( -5, 5, 1.0e-3, -1, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Erf(scope, input); - }); + }, GetParam()); } -TEST_F(AutoMixedPrecisionTest, ErfcOp) { +TEST_P(AutoMixedPrecisionParamTest, ErfcOp) { TestSimpleUnaryInferOp( -5, 5, 1.0e-3, -1, [](const tensorflow::Scope& scope, Output input) -> Output { 
return ops::Erfc(scope, input); - }); + }, GetParam()); } -TEST_F(AutoMixedPrecisionTest, InvOp) { +TEST_P(AutoMixedPrecisionParamTest, InvOp) { TestSimpleUnaryInferOp( 0.01, 10, -1, 1.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Inv(scope, input); - }); + }, GetParam()); } -TEST_F(AutoMixedPrecisionTest, LogOp) { +TEST_P(AutoMixedPrecisionParamTest, LogOp) { TestSimpleUnaryInferOp( 0.01, 10, 1.0e-3, 2.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Log(scope, input); - }); + }, GetParam()); } -TEST_F(AutoMixedPrecisionTest, Log1pOp) { +TEST_P(AutoMixedPrecisionParamTest, Log1pOp) { TestSimpleUnaryInferOp( -0.99, 9, 1.0e-3, 5.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Log1p(scope, input); - }); + }, GetParam()); } -TEST_F(AutoMixedPrecisionTest, LogSoftmaxOp) { +TEST_P(AutoMixedPrecisionParamTest, LogSoftmaxOp) { TestSimpleUnaryInferOp( -8, 8, -1, 1.0e-2, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::LogSoftmax(scope, input); - }); + }, GetParam()); } -TEST_F(AutoMixedPrecisionTest, ReciprocalOp) { +TEST_P(AutoMixedPrecisionParamTest, ReciprocalOp) { TestSimpleUnaryInferOp( 0.01, 10, -1, 1.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Reciprocal(scope, input); - }); + }, GetParam()); } -TEST_F(AutoMixedPrecisionTest, SigmoidOp) { +TEST_P(AutoMixedPrecisionParamTest, SigmoidOp) { TestSimpleUnaryInferOp( -5, 5, 1.0e-3, -1, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Sigmoid(scope, input); - }); + }, GetParam()); } -TEST_F(AutoMixedPrecisionTest, SoftmaxOp) { +TEST_P(AutoMixedPrecisionParamTest, SoftmaxOp) { TestSimpleUnaryInferOp( -8, 8, 2.0e-3, -1, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Softmax(scope, input); - }); + }, GetParam()); } -TEST_F(AutoMixedPrecisionTest, SoftplusOp) { +TEST_P(AutoMixedPrecisionParamTest, SoftplusOp) { TestSimpleUnaryInferOp( -5, 5, 2.0e-3, 2.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Softplus(scope, input); - }); + }, GetParam()); } -TEST_F(AutoMixedPrecisionTest, SqrtOp) { +TEST_P(AutoMixedPrecisionParamTest, SqrtOp) { TestSimpleUnaryInferOp( 0, 10, 1.0e-3, 1.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Sqrt(scope, input); - }); + }, GetParam()); } -TEST_F(AutoMixedPrecisionTest, TanhOp) { +TEST_P(AutoMixedPrecisionParamTest, TanhOp) { TestSimpleUnaryInferOp( -5, 5, 1.0e-3, -1, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Tanh(scope, input); - }); + }, GetParam()); } +INSTANTIATE_TEST_SUITE_P(AutoMixedPrecisionTest, AutoMixedPrecisionParamTest, + ::testing::ValuesIn({ +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + AutoMixedPrecisionMode::CUDA, +#endif +#if INTEL_MKL + AutoMixedPrecisionMode::FP16_CPU +#endif + })); + class AutoMixedPrecisionCpuTest : public GrapplerTest { protected: void SetUp() override { @@ -1761,6 +1795,7 @@ TEST_F(AutoMixedPrecisionMklTest, InferFollowUpStreamDeny) { test::ExpectClose(tensors_expected[i], tensors[i]); } } + #endif // INTEL_MKL } // namespace diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 999e7c0dc6d092..6bfa08a78866dc 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -374,8 +374,14 @@ Status MetaOptimizer::InitializeOptimizers( if 
(AutoMixedPrecisionEnabled(cfg_.auto_mixed_precision()) && AutoMixedPrecisionEnabled( plugin_configs.toggle_config["auto_mixed_precision"])) { - optimizers->push_back( - std::make_unique(AutoMixedPrecisionMode::CUDA)); + if (device_types.size() == 1 && + device_types.find("CPU") != device_types.end()) { + optimizers->push_back( + std::make_unique(AutoMixedPrecisionMode::FP16_CPU)); + } else { + optimizers->push_back( + std::make_unique(AutoMixedPrecisionMode::CUDA)); + } } #ifdef INTEL_MKL if (AutoMixedPrecisionEnabled(cfg_.auto_mixed_precision_onednn_bfloat16()) && From 0ab5b80f9bf952e3eb6f5fb1df0881be7cdd0959 Mon Sep 17 00:00:00 2001 From: Gauri1 Deshpande Date: Mon, 12 Feb 2024 10:27:02 -0800 Subject: [PATCH 006/670] minor change --- .../core/grappler/optimizers/auto_mixed_precision_test.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc index c8b2e18318a143..780c4835b1c45e 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc @@ -1311,8 +1311,7 @@ INSTANTIATE_TEST_SUITE_P(AutoMixedPrecisionTest, AutoMixedPrecisionParamTest, ::testing::ValuesIn({ #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM AutoMixedPrecisionMode::CUDA, -#endif -#if INTEL_MKL +#elif INTEL_MKL AutoMixedPrecisionMode::FP16_CPU #endif })); From 182a17386f94f26fa7f9753eb03ea0b6263ef8b6 Mon Sep 17 00:00:00 2001 From: Gauri1 Deshpande Date: Fri, 16 Feb 2024 15:24:11 -0800 Subject: [PATCH 007/670] Address review comments --- .../optimizers/auto_mixed_precision.cc | 22 +- .../optimizers/auto_mixed_precision_lists.h | 93 +++++---- .../optimizers/auto_mixed_precision_test.cc | 191 +++++++++--------- .../grappler/optimizers/meta_optimizer.cc | 16 +- 4 files changed, 160 insertions(+), 162 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc index 40201896086e4f..8d3ca6f758aa70 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc @@ -1064,19 +1064,19 @@ class AutoMixedPrecisionImpl { std::unique_ptr get_mixed_precision_lists() const { switch (mode_) { case AutoMixedPrecisionMode::CUDA: - return std::make_unique(cuda_version_, - cudnn_version_); + return std::make_unique( + cuda_version_, cudnn_version_, AutoMixedPrecisionMode::CUDA); case AutoMixedPrecisionMode::BF16: - return std::make_unique( - AutoMixedPrecisionMode::BF16); + return std::make_unique(); case AutoMixedPrecisionMode::CPU: - // Note: this is not a typo here. AutoMixedPrecisionListsCuda is used + // Note: this is not a typo here. AutoMixedPrecisionListsFp16 is used // intentionally to make CPU and GPU have the same fp16 ops. - return std::make_unique( + return std::make_unique( /*cuda_version=*/10000, // Hardcode cuda and cudnn version so - /*cudnn_version=*/8000); // CPU emulates the same ops on GPU. + /*cudnn_version=*/8000, // CPU emulates the same ops on GPU. 
+ AutoMixedPrecisionMode::CPU); case AutoMixedPrecisionMode::FP16_CPU: - return std::make_unique( + return std::make_unique(0, 0, AutoMixedPrecisionMode::FP16_CPU); } } @@ -1865,8 +1865,7 @@ void AutoMixedPrecisionImpl::AddInferToAllowIfFollowAllow( const absl::flat_hash_set& deny_set, absl::flat_hash_set* allow_set) const { // Currently only target for oneDNN - if (mode_ != AutoMixedPrecisionMode::BF16 && - mode_ != AutoMixedPrecisionMode::FP16_CPU) { + if (mode_ != AutoMixedPrecisionMode::BF16) { return; } for (int item_idx = 0; item_idx < graph_type_view_.num_nodes(); ++item_idx) { @@ -2319,7 +2318,6 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item, LOG(WARNING) << "No support for " << name() << " graph optimizer on CPU"; return OkStatus(); } - LOG(INFO) << "Running " << name() << " graph optimizer "; if (num_gpus >= 1 && mode_ == AutoMixedPrecisionMode::BF16) { LOG(WARNING) << "Note: GPUs detected. Using " << name() @@ -2332,7 +2330,7 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item, if (item.id == "tf_graph") { LOG(INFO) << "Running " << name() << " graph optimizer"; } else { - LOG(INFO) << "Running " << name() << " graph optimizer on " << item.id; + VLOG(1) << "Running " << name() << " graph optimizer on " << item.id; } Status status = optimizer.Optimize(); if (!status.ok()) { diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h index 4a520fa3377e8a..810a3ea8d6f6d8 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h @@ -95,7 +95,7 @@ class AutoMixedPrecisionLists { } }; -class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { +class AutoMixedPrecisionListsFp16 : public AutoMixedPrecisionLists { private: static bool IsPseudoFastMath() { string optimization_level; @@ -107,50 +107,60 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { } public: - AutoMixedPrecisionListsCuda(int cuda_version, int cudnn_version) - : cuda_version_(cuda_version), cudnn_version_(cudnn_version) {} + AutoMixedPrecisionListsFp16(int cuda_version, int cudnn_version, + AutoMixedPrecisionMode mode) + : cuda_version_(cuda_version), cudnn_version_(cudnn_version) { + if (mode == AutoMixedPrecisionMode::CUDA || + mode == AutoMixedPrecisionMode::CPU) { + use_cuda_ = true; + } else if (mode == AutoMixedPrecisionMode::FP16_CPU) { + use_onednn_ = true; + } + } gtl::FlatSet AllowList() override { auto list = gtl::FlatSet{ - "BlockLSTM", - "BlockLSTMV2", - "BlockLSTMGrad", - "BlockLSTMGradV2", "Conv2D", "Conv2DBackpropFilter", "Conv2DBackpropInput", - "CudnnRNN", - "CudnnRNNBackprop", - "CudnnRNNBackpropV2", - "CudnnRNNBackpropV3", - "CudnnRNNV2", - "CudnnRNNV3", "Einsum", - "FusedConv2DBiasActivation", - "FusedSparseConvGpuV2", - "GRUBlockCell", - "GRUBlockCellGrad", - "LSTMBlockCell", - "LSTMBlockCellGrad", "MatMul", - "Mha", - "MhaV2", - "Tmlp", - "TmlpV2", - "TmlpV3", - "Pmlp", - "FastUnsortedSegmentMax", }; + if (use_cuda_) { + list.insert("BlockLSTM"); + list.insert("BlockLSTMV2"); + list.insert("BlockLSTMGrad"); + list.insert("BlockLSTMGradV2"); + list.insert("CudnnRNN"); + list.insert("CudnnRNNBackprop"); + list.insert("CudnnRNNBackpropV2"); + list.insert("CudnnRNNBackpropV3"); + list.insert("CudnnRNNV2"); + list.insert("CudnnRNNV3"); + list.insert("FusedConv2DBiasActivation"); + list.insert("FusedSparseConvGpuV2"); + 
list.insert("GRUBlockCell"); + list.insert("GRUBlockCellGrad"); + list.insert("LSTMBlockCell"); + list.insert("LSTMBlockCellGrad"); + list.insert("Mha"); + list.insert("MhaV2"); + list.insert("Tmlp"); + list.insert("TmlpV2"); + list.insert("TmlpV3"); + list.insert("Pmlp"); + list.insert("FastUnsortedSegmentMax"); + } #if TENSORFLOW_USE_ROCM if (true) { #else - if (cuda_version_ >= 9010) { + if ((use_cuda_ && cuda_version_ >= 9010) || use_onednn_ ) { // Fp16 BatchMatMul is slow before CUDA 9.1. #endif list.insert("BatchMatMul"); list.insert("BatchMatMulV2"); } - if (cudnn_version_ >= 7602) { + if ((use_cuda_ && cudnn_version_ >= 7602) || use_onednn_) { // Fp16 3D conv is slow before CUDNN 7.6.2. list.insert("Conv3D"); list.insert("Conv3DBackpropFilter"); @@ -158,7 +168,7 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { list.insert("Conv3DBackpropInput"); list.insert("Conv3DBackpropInputV2"); } - if (cudnn_version_ >= 8000) { + if ((use_cuda_ && cudnn_version_ >= 8000) || use_onednn_) { list.insert("DepthwiseConv2dNative"); list.insert("DepthwiseConv2dNativeBackpropFilter"); list.insert("DepthwiseConv2dNativeBackpropInput"); @@ -172,7 +182,7 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { } gtl::FlatSet InferList() override { - if (IsPseudoFastMath()) { + if (IsPseudoFastMath() && use_cuda_) { return gtl::FlatSet{}; } @@ -221,6 +231,11 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { "Tanh", "TanhGrad", }; + if (use_onednn_) { + list.insert("Rsqrt"); + list.insert("Square"); + list.insert("SquaredDifference"); + } UpdateList("INFERLIST", &list); // For backwards compatibility, keeping the original env variable here. // TODO(reedwm): This should be removed if we don't have active users. @@ -229,7 +244,7 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { } gtl::FlatSet DenyList() override { - if (IsPseudoFastMath()) { + if (IsPseudoFastMath() && use_cuda_) { return gtl::FlatSet{}; } @@ -252,7 +267,7 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { } gtl::FlatSet ClearList() override { - if (IsPseudoFastMath()) { + if (IsPseudoFastMath() && use_cuda_) { return gtl::FlatSet{}; } @@ -361,11 +376,13 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists { private: int cuda_version_; int cudnn_version_; + bool use_cuda_; + bool use_onednn_; }; class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { public: - AutoMixedPrecisionListsMkl(AutoMixedPrecisionMode mode) : mode_(mode) {} + AutoMixedPrecisionListsMkl() {} // Only ops which are supported by MKL in bfloat16 should be added to the // allow list, infer list, or clear list. @@ -439,10 +456,6 @@ class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { "SquaredDifference", "Tanh", "TanhGrad"}; - if (mode_ != AutoMixedPrecisionMode::FP16_CPU) { - list.insert("Mean"); - list.insert("Sum"); - } UpdateList("INFERLIST", &list); // For backwards compatibility, keeping the original env variable here. // TODO(reedwm): This should be removed if we don't have active users. @@ -460,10 +473,6 @@ class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { "SoftmaxCrossEntropyWithLogits", "SparseSoftmaxCrossEntropyWithLogits", }; - if (mode_ == AutoMixedPrecisionMode::FP16_CPU) { - list.insert("Mean"); - list.insert("Sum"); - } UpdateList("DENYLIST", &list); // For backwards compatibility, keeping the original env variable here. // TODO(reedwm): This should be removed if we don't have active users. 
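As context for the UpdateList calls above: each list can also be adjusted at run time through environment variables instead of a rebuild. A minimal sketch follows; the variable names and the comma-separated value format follow the TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_<LIST>_ADD / _REMOVE pattern that UpdateList appears to parse, so treat them as assumptions to check against that helper.

```python
import os

# Assumed names, following the "<prefix><LIST>_ADD/_REMOVE" convention; for
# example, force Mean and Sum onto the deny list without editing this header.
# Set the variables before TensorFlow is imported so grappler sees them.
os.environ["TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_DENYLIST_ADD"] = "Mean,Sum"

import tensorflow as tf  # imported after the override is in place
```

That is roughly the user-level equivalent of the Mean/Sum special-casing removed above.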
@@ -572,8 +581,6 @@ class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { UpdateList("CLEARLIST", &list); return list; } - private: - AutoMixedPrecisionMode mode_; }; } // end namespace grappler diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc index 780c4835b1c45e..60dd0b88eed075 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc @@ -99,43 +99,47 @@ const std::pair kMinGPUArch = {7, 0}; class AutoMixedPrecisionTest : public GrapplerTest { protected: + void SetMode(AutoMixedPrecisionMode mode) { + mode_ = mode; + } void SetUp() override { - int num_gpus = GetNumAvailableGPUs(); - // If GPUs are available, require that they all satisfy the min arch. - gpu_available_ = (num_gpus > 0); + if (mode_ == AutoMixedPrecisionMode::CUDA) { + int num_gpus = GetNumAvailableGPUs(); + // If GPUs are available, require that they all satisfy the min arch. + gpu_available_ = (num_gpus > 0); #if GOOGLE_CUDA - gpu_available_ = - gpu_available_ && (num_gpus == GetNumAvailableGPUs(kMinGPUArch)); + gpu_available_ = + gpu_available_ && (num_gpus == GetNumAvailableGPUs(kMinGPUArch)); #else // Here we force Tensorflow to use the virtual GFX906 - gpu_available_ = false; + gpu_available_ = false; #endif - if (gpu_available_) { - virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1)); - } else if( num_gpus > 0) { - DeviceProperties device_properties; - device_properties.set_type("GPU"); + if (gpu_available_) { + virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1)); + } else { + DeviceProperties device_properties; + device_properties.set_type("GPU"); #if GOOGLE_CUDA - device_properties.mutable_environment()->insert({"architecture", "7"}); - device_properties.mutable_environment()->insert({"cuda", "9010"}); + device_properties.mutable_environment()->insert({"architecture", "7"}); + device_properties.mutable_environment()->insert({"cuda", "9010"}); #else - device_properties.mutable_environment()->insert( - {"architecture", "gfx906"}); + device_properties.mutable_environment()->insert( + {"architecture", "gfx906"}); #endif - virtual_cluster_.reset( - new VirtualCluster({{"/GPU:1", device_properties}})); - } else { + virtual_cluster_.reset( + new VirtualCluster({{"/GPU:1", device_properties}})); + } + } else if (mode_ == AutoMixedPrecisionMode::FP16_CPU) { // When no GPUs are available, try running on CPU. 
DeviceProperties device_properties; device_properties.set_type("CPU"); virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 0)); - bool run_fp16_on_cpu = false; + is_fp16_enabled_on_cpu_ = false; #if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3) - run_fp16_on_cpu = IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF); + is_fp16_enabled_on_cpu_ = IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF); #endif // INTEL_MKL && ENABLE_ONEDNN_V3 } TF_CHECK_OK(virtual_cluster_->Provision()); - } void TearDown() override { TF_CHECK_OK(virtual_cluster_->Shutdown()); } @@ -182,8 +186,8 @@ class AutoMixedPrecisionTest : public GrapplerTest { void TestSimpleUnaryInferOp( double input_min, double input_max, double atol, double rtol, const std::function& - test_op_factory, AutoMixedPrecisionMode mode) { - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + test_op_factory) { + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; int size = 128; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); @@ -202,7 +206,7 @@ class AutoMixedPrecisionTest : public GrapplerTest { std::vector> feed = {{"input", input_tensor}}; auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -223,9 +227,10 @@ class AutoMixedPrecisionTest : public GrapplerTest { } } - bool ShouldSkipTest(AutoMixedPrecisionMode mode) { - if (mode == AutoMixedPrecisionMode::CUDA && GetNumAvailableGPUs() > 0 || - mode == AutoMixedPrecisionMode::FP16_CPU && is_fp16_enabled_on_cpu_) { + bool ShouldSkipTest() { + if (mode_ == AutoMixedPrecisionMode::CUDA && GetNumAvailableGPUs() > 0 || + (mode_ == AutoMixedPrecisionMode::FP16_CPU && IsMKLEnabled() && + is_fp16_enabled_on_cpu_)) { return false; } else { return true; @@ -235,15 +240,24 @@ class AutoMixedPrecisionTest : public GrapplerTest { std::unique_ptr virtual_cluster_; bool gpu_available_; bool is_fp16_enabled_on_cpu_; + AutoMixedPrecisionMode mode_; }; class AutoMixedPrecisionParamTest : public AutoMixedPrecisionTest, public ::testing::WithParamInterface< - AutoMixedPrecisionMode> {}; + AutoMixedPrecisionMode> { + + protected: + void SetUp() override { + mode_ = GetParam(); + AutoMixedPrecisionTest::SetMode(mode_); + AutoMixedPrecisionTest::SetUp(); + } + AutoMixedPrecisionMode mode_; +}; TEST_P(AutoMixedPrecisionParamTest, NoOp) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.234f, {32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -257,7 +271,7 @@ TEST_P(AutoMixedPrecisionParamTest, NoOp) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -281,8 +295,7 @@ TEST_P(AutoMixedPrecisionParamTest, NoOp) { } TEST_P(AutoMixedPrecisionParamTest, AlreadyFp16) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s 
= tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f, {32, 32}); Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_HALF); @@ -297,7 +310,7 @@ TEST_P(AutoMixedPrecisionParamTest, AlreadyFp16) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); VLOG(1) << output.DebugString(); @@ -321,8 +334,7 @@ TEST_P(AutoMixedPrecisionParamTest, AlreadyFp16) { } TEST_P(AutoMixedPrecisionParamTest, Simple) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -342,7 +354,7 @@ TEST_P(AutoMixedPrecisionParamTest, Simple) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -372,8 +384,7 @@ TEST_P(AutoMixedPrecisionParamTest, Simple) { } TEST_P(AutoMixedPrecisionParamTest, NoInferOp) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; setenv("TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL", "TREAT_INFER_AS_DENY", 1 /* replace */); tensorflow::Scope s = tensorflow::Scope::NewRootScope(); @@ -395,7 +406,7 @@ TEST_P(AutoMixedPrecisionParamTest, NoInferOp) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -425,8 +436,7 @@ TEST_P(AutoMixedPrecisionParamTest, NoInferOp) { } TEST_P(AutoMixedPrecisionParamTest, BidirectionalClearChain) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output clr1 = ops::Relu(s.WithOpName("clr1"), input); @@ -442,7 +452,7 @@ TEST_P(AutoMixedPrecisionParamTest, BidirectionalClearChain) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -466,8 +476,7 @@ TEST_P(AutoMixedPrecisionParamTest, BidirectionalClearChain) { } TEST_P(AutoMixedPrecisionParamTest, PreserveFetches) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output allow1 = 
ops::MatMul(s.WithOpName("allow1"), input, input); @@ -485,7 +494,7 @@ TEST_P(AutoMixedPrecisionParamTest, PreserveFetches) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -513,8 +522,7 @@ TEST_P(AutoMixedPrecisionParamTest, PreserveFetches) { } TEST_P(AutoMixedPrecisionParamTest, PreserveCPUNodes) { - AutoMixedPrecisionMode mode = GetParam(); - if (mode == AutoMixedPrecisionMode::FP16_CPU) { + if (mode_ == AutoMixedPrecisionMode::FP16_CPU) { GTEST_SKIP() << "This test is not required on CPU"; } tensorflow::Scope s = tensorflow::Scope::NewRootScope(); @@ -534,7 +542,7 @@ TEST_P(AutoMixedPrecisionParamTest, PreserveCPUNodes) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -558,8 +566,7 @@ TEST_P(AutoMixedPrecisionParamTest, PreserveCPUNodes) { } TEST_P(AutoMixedPrecisionParamTest, PreserveIdentityAfterVariable) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output var1 = ops::Variable(s.WithOpName("var1"), {32, 32}, DT_FLOAT); @@ -579,7 +586,7 @@ TEST_P(AutoMixedPrecisionParamTest, PreserveIdentityAfterVariable) { std::vector> feed = {{"var1", var1_tensor}}; auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -604,8 +611,7 @@ TEST_P(AutoMixedPrecisionParamTest, PreserveIdentityAfterVariable) { } TEST_P(AutoMixedPrecisionParamTest, FusedBatchNorm) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); // Uses NHWC data format because non-GPU execution does not support NCHW. 
Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {8, 56, 56, 16}); @@ -638,7 +644,7 @@ TEST_P(AutoMixedPrecisionParamTest, FusedBatchNorm) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -665,8 +671,7 @@ TEST_P(AutoMixedPrecisionParamTest, FusedBatchNorm) { } TEST_P(AutoMixedPrecisionParamTest, RepeatedAndListTypeAttrs) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output allow1 = ops::MatMul(s.WithOpName("allow1"), input, input); @@ -682,7 +687,7 @@ TEST_P(AutoMixedPrecisionParamTest, RepeatedAndListTypeAttrs) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -707,8 +712,7 @@ TEST_P(AutoMixedPrecisionParamTest, RepeatedAndListTypeAttrs) { } TEST_P(AutoMixedPrecisionParamTest, ExistingCast) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), true, {32, 32}); Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_FLOAT); @@ -720,7 +724,7 @@ TEST_P(AutoMixedPrecisionParamTest, ExistingCast) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -741,8 +745,7 @@ TEST_P(AutoMixedPrecisionParamTest, ExistingCast) { } TEST_P(AutoMixedPrecisionParamTest, RecurrentEdgeColorMismatch) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -775,7 +778,7 @@ TEST_P(AutoMixedPrecisionParamTest, RecurrentEdgeColorMismatch) { const_node->add_input("^mrg1"); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -805,8 +808,7 @@ TEST_P(AutoMixedPrecisionParamTest, RecurrentEdgeColorMismatch) { } TEST_P(AutoMixedPrecisionParamTest, TensorListSetGet) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32, 32}; auto tl1 = ops::TensorListReserve(s.WithOpName("tl1"), {32, 32}, 8, 
DT_FLOAT); @@ -848,7 +850,7 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListSetGet) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -878,8 +880,7 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListSetGet) { } TEST_P(AutoMixedPrecisionParamTest, TensorListPushPop) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32, 32}; auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT); @@ -913,7 +914,7 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListPushPop) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -943,8 +944,7 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListPushPop) { } TEST_P(AutoMixedPrecisionParamTest, TensorListFromTensor) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32}; Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); @@ -969,7 +969,7 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListFromTensor) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -995,8 +995,7 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListFromTensor) { } TEST_P(AutoMixedPrecisionParamTest, TensorListPushBackBatchAndConcatLists) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32, 32}; auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT); @@ -1031,7 +1030,7 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListPushBackBatchAndConcatLists) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -1057,8 +1056,7 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListPushBackBatchAndConcatLists) { } TEST_P(AutoMixedPrecisionParamTest, TensorListThroughFunction) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; // This test passes a tensor list handle through a function with its own // Tensor List ops inside to test that the types are not changed to a // conflicting state. 
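The comment above sums up the scenario being guarded: a tensor list handle built in the outer graph is read and written inside a function, so the element dtype picked by the rewrite has to stay consistent on both sides of the call. Roughly the Python-level shape of such a graph is sketched below (an illustration with made-up shapes, not the test itself):

```python
import tensorflow as tf

@tf.function
def through_function(handle):
    # A function with its own TensorList ops; after any rewrite, the element
    # dtype seen here must agree with the one used by the caller.
    item = tf.raw_ops.TensorListGetItem(
        input_handle=handle, index=0, element_shape=[32, 32],
        element_dtype=tf.float32)
    return tf.raw_ops.TensorListSetItem(
        input_handle=handle, index=1, item=tf.matmul(item, item))

handle = tf.raw_ops.TensorListReserve(
    element_shape=[32, 32], num_elements=2, element_dtype=tf.float32)
handle = tf.raw_ops.TensorListSetItem(
    input_handle=handle, index=0, item=tf.ones([32, 32]))
handle = through_function(handle)
result = tf.raw_ops.TensorListGetItem(
    input_handle=handle, index=1, element_shape=[32, 32],
    element_dtype=tf.float32)
```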
@@ -1119,7 +1117,7 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListThroughFunction) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -1167,8 +1165,7 @@ bool IsSupportedGPU(const Cluster& cluster) { } TEST_P(AutoMixedPrecisionParamTest, BatchMatMul) { - AutoMixedPrecisionMode mode = GetParam(); - if (ShouldSkipTest(mode)) GTEST_SKIP() << "This device doesn't support FP16"; + if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 33, {64, 32, 32}); Output allow1 = ops::BatchMatMul(s.WithOpName("allow1"), input, input); @@ -1179,7 +1176,7 @@ TEST_P(AutoMixedPrecisionParamTest, BatchMatMul) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - AutoMixedPrecision optimizer(mode); + AutoMixedPrecision optimizer(mode_); GraphDef output; TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); @@ -1208,7 +1205,7 @@ TEST_P(AutoMixedPrecisionParamTest, EluOp) { -5, 5, 1.0e-3, 1.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Elu(scope, input); - }, GetParam()); + }); } TEST_P(AutoMixedPrecisionParamTest, ErfOp) { @@ -1216,7 +1213,7 @@ TEST_P(AutoMixedPrecisionParamTest, ErfOp) { -5, 5, 1.0e-3, -1, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Erf(scope, input); - }, GetParam()); + }); } TEST_P(AutoMixedPrecisionParamTest, ErfcOp) { @@ -1224,7 +1221,7 @@ TEST_P(AutoMixedPrecisionParamTest, ErfcOp) { -5, 5, 1.0e-3, -1, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Erfc(scope, input); - }, GetParam()); + }); } TEST_P(AutoMixedPrecisionParamTest, InvOp) { @@ -1232,7 +1229,7 @@ TEST_P(AutoMixedPrecisionParamTest, InvOp) { 0.01, 10, -1, 1.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Inv(scope, input); - }, GetParam()); + }); } TEST_P(AutoMixedPrecisionParamTest, LogOp) { @@ -1240,7 +1237,7 @@ TEST_P(AutoMixedPrecisionParamTest, LogOp) { 0.01, 10, 1.0e-3, 2.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Log(scope, input); - }, GetParam()); + }); } TEST_P(AutoMixedPrecisionParamTest, Log1pOp) { @@ -1248,7 +1245,7 @@ TEST_P(AutoMixedPrecisionParamTest, Log1pOp) { -0.99, 9, 1.0e-3, 5.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Log1p(scope, input); - }, GetParam()); + }); } TEST_P(AutoMixedPrecisionParamTest, LogSoftmaxOp) { @@ -1256,7 +1253,7 @@ TEST_P(AutoMixedPrecisionParamTest, LogSoftmaxOp) { -8, 8, -1, 1.0e-2, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::LogSoftmax(scope, input); - }, GetParam()); + }); } TEST_P(AutoMixedPrecisionParamTest, ReciprocalOp) { @@ -1264,7 +1261,7 @@ TEST_P(AutoMixedPrecisionParamTest, ReciprocalOp) { 0.01, 10, -1, 1.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Reciprocal(scope, input); - }, GetParam()); + }); } TEST_P(AutoMixedPrecisionParamTest, SigmoidOp) { @@ -1272,7 +1269,7 @@ TEST_P(AutoMixedPrecisionParamTest, SigmoidOp) { -5, 5, 1.0e-3, -1, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Sigmoid(scope, input); - }, GetParam()); + }); } TEST_P(AutoMixedPrecisionParamTest, SoftmaxOp) 
{ @@ -1280,7 +1277,7 @@ TEST_P(AutoMixedPrecisionParamTest, SoftmaxOp) { -8, 8, 2.0e-3, -1, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Softmax(scope, input); - }, GetParam()); + }); } TEST_P(AutoMixedPrecisionParamTest, SoftplusOp) { @@ -1288,7 +1285,7 @@ TEST_P(AutoMixedPrecisionParamTest, SoftplusOp) { -5, 5, 2.0e-3, 2.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Softplus(scope, input); - }, GetParam()); + }); } TEST_P(AutoMixedPrecisionParamTest, SqrtOp) { @@ -1296,7 +1293,7 @@ TEST_P(AutoMixedPrecisionParamTest, SqrtOp) { 0, 10, 1.0e-3, 1.0e-3, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Sqrt(scope, input); - }, GetParam()); + }); } TEST_P(AutoMixedPrecisionParamTest, TanhOp) { @@ -1304,14 +1301,15 @@ TEST_P(AutoMixedPrecisionParamTest, TanhOp) { -5, 5, 1.0e-3, -1, [](const tensorflow::Scope& scope, Output input) -> Output { return ops::Tanh(scope, input); - }, GetParam()); + }); } INSTANTIATE_TEST_SUITE_P(AutoMixedPrecisionTest, AutoMixedPrecisionParamTest, ::testing::ValuesIn({ #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM AutoMixedPrecisionMode::CUDA, -#elif INTEL_MKL +#endif +#if INTEL_MKL AutoMixedPrecisionMode::FP16_CPU #endif })); @@ -1794,7 +1792,6 @@ TEST_F(AutoMixedPrecisionMklTest, InferFollowUpStreamDeny) { test::ExpectClose(tensors_expected[i], tensors[i]); } } - #endif // INTEL_MKL } // namespace diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 6bfa08a78866dc..3c0f37d2e9ea4a 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -372,16 +372,12 @@ Status MetaOptimizer::InitializeOptimizers( optimizers->push_back(std::make_unique()); } if (AutoMixedPrecisionEnabled(cfg_.auto_mixed_precision()) && - AutoMixedPrecisionEnabled( - plugin_configs.toggle_config["auto_mixed_precision"])) { - if (device_types.size() == 1 && - device_types.find("CPU") != device_types.end()) { - optimizers->push_back( - std::make_unique(AutoMixedPrecisionMode::FP16_CPU)); - } else { - optimizers->push_back( - std::make_unique(AutoMixedPrecisionMode::CUDA)); - } + AutoMixedPrecisionEnabled( + plugin_configs.toggle_config["auto_mixed_precision"])) { + optimizers->push_back( + std::make_unique(AutoMixedPrecisionMode::FP16_CPU)); + optimizers->push_back( + std::make_unique(AutoMixedPrecisionMode::CUDA)); } #ifdef INTEL_MKL if (AutoMixedPrecisionEnabled(cfg_.auto_mixed_precision_onednn_bfloat16()) && From 9780d75ad68275bf30b81c7cf02688fb87e5f1e2 Mon Sep 17 00:00:00 2001 From: Gauri1 Deshpande Date: Fri, 16 Feb 2024 19:59:22 -0800 Subject: [PATCH 008/670] Address re-review comments --- .../optimizers/auto_mixed_precision.cc | 6 +-- .../optimizers/auto_mixed_precision.h | 4 +- .../optimizers/auto_mixed_precision_lists.h | 7 +-- .../optimizers/auto_mixed_precision_test.cc | 50 ++++--------------- .../grappler/optimizers/meta_optimizer.cc | 2 + .../core/protobuf/rewriter_config.proto | 4 +- tensorflow/python/framework/config.py | 4 +- 7 files changed, 24 insertions(+), 53 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc index 8d3ca6f758aa70..d70ef54aee2533 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc @@ -1069,8 +1069,6 @@ class AutoMixedPrecisionImpl { case 
AutoMixedPrecisionMode::BF16: return std::make_unique(); case AutoMixedPrecisionMode::CPU: - // Note: this is not a typo here. AutoMixedPrecisionListsFp16 is used - // intentionally to make CPU and GPU have the same fp16 ops. return std::make_unique( /*cuda_version=*/10000, // Hardcode cuda and cudnn version so /*cudnn_version=*/8000, // CPU emulates the same ops on GPU. @@ -2308,14 +2306,14 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item, int num_gpus = GetNumGPUs(*cluster); if (num_gpus < 1 && mode_ == AutoMixedPrecisionMode::CUDA) { // No GPUs to run AutoMixedPrecision in FP16. - LOG(WARNING) << "No (suitable) GPUs detected, skipping " << name() + VLOG(1) << "No (suitable) GPUs detected, skipping " << name() << " graph optimizer"; return OkStatus(); } // Check if CPU supports FP16 if (mode_ == AutoMixedPrecisionMode::FP16_CPU && !IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF)) { - LOG(WARNING) << "No support for " << name() << " graph optimizer on CPU"; + VLOG(1) << "No support for " << name() << " graph optimizer on CPU"; return OkStatus(); } diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.h b/tensorflow/core/grappler/optimizers/auto_mixed_precision.h index 3f478ec3038534..c26b640765f3d4 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.h +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.h @@ -51,8 +51,8 @@ class AutoMixedPrecision : public GraphOptimizer { case AutoMixedPrecisionMode::CPU: return "auto_mixed_precision_cpu"; case AutoMixedPrecisionMode::FP16_CPU: - // Note: use same config for FP16 on CPU & GPU. - return "auto_mixed_precision"; + // Note: using different name than GPU for ease of debugging. + return "auto_mixed_precision_onednn_float16"; default: LOG(FATAL) << "Invalid value for AutoMixedPrecisionMode: " // Crash Ok << static_cast(mode_); diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h index 810a3ea8d6f6d8..5c4cf2940f1720 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h @@ -112,9 +112,13 @@ class AutoMixedPrecisionListsFp16 : public AutoMixedPrecisionLists { : cuda_version_(cuda_version), cudnn_version_(cudnn_version) { if (mode == AutoMixedPrecisionMode::CUDA || mode == AutoMixedPrecisionMode::CPU) { + // Note: this is not a typo here. use_cuda_ is set to true for the CPU + // intentionally to make CPU and GPU have the same fp16 ops. 
use_cuda_ = true; + use_onednn_ = false; } else if (mode == AutoMixedPrecisionMode::FP16_CPU) { use_onednn_ = true; + use_cuda_ = false; } } @@ -421,12 +425,10 @@ class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { "BiasAddGrad", "BiasAddV1", "Erf", - "Erfc", "FusedBatchNormV2", "FusedBatchNormGradV2", "FusedBatchNormV3", "FusedBatchNormGradV3", - "Inv", "LeakyRelu", "LeakyReluGrad", "Mul", @@ -509,7 +511,6 @@ class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists { "Greater", "GreaterEqual", "Identity", - "IdentityN", "IsFinite", "IsInf", "IsNan", diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc index 60dd0b88eed075..9ac263c89068ab 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc @@ -89,12 +89,11 @@ void VerifyGraphsEquivalent(const GraphDef& original_graph, } } -// Currently, this test suite only passes when TensorFlow passes with CUDA/HIP, -// because otherwise the optimizer will not turn clearlist nodes to float16. -// When looking at clearlist nodes, this optimizer checks if the nodes have a -// float16 GPU OpKernel, but without CUDA/HIP there are no GPU OpKernels at all. -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM || INTEL_MKL - +// Currently on GPU, this test suite only passes when TensorFlow passes with +// CUDA/HIP, because otherwise the optimizer will not turn clearlist nodes to +// float16. When looking at clearlist nodes, this optimizer checks if the nodes +// have a float16 GPU OpKernel, but without CUDA/HIP there are no GPU OpKernels +// at all. And on CPU, this test suite passes when AMX FP16 is supported. const std::pair kMinGPUArch = {7, 0}; class AutoMixedPrecisionTest : public GrapplerTest { @@ -129,15 +128,17 @@ class AutoMixedPrecisionTest : public GrapplerTest { new VirtualCluster({{"/GPU:1", device_properties}})); } } else if (mode_ == AutoMixedPrecisionMode::FP16_CPU) { - // When no GPUs are available, try running on CPU. 
DeviceProperties device_properties; device_properties.set_type("CPU"); virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 0)); - is_fp16_enabled_on_cpu_ = false; + bool is_fp16_enabled_on_cpu = false; #if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3) - is_fp16_enabled_on_cpu_ = IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF); + is_fp16_enabled_on_cpu = IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF); #endif // INTEL_MKL && ENABLE_ONEDNN_V3 + if(!IsMKLEnabled() || !is_fp16_enabled_on_cpu) { + GTEST_SKIP() << "This device doesn't support FP16"; + } } TF_CHECK_OK(virtual_cluster_->Provision()); } @@ -187,8 +188,6 @@ class AutoMixedPrecisionTest : public GrapplerTest { double input_min, double input_max, double atol, double rtol, const std::function& test_op_factory) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; - int size = 128; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output eye = ops::Const(s.WithOpName("eye"), @@ -227,19 +226,8 @@ class AutoMixedPrecisionTest : public GrapplerTest { } } - bool ShouldSkipTest() { - if (mode_ == AutoMixedPrecisionMode::CUDA && GetNumAvailableGPUs() > 0 || - (mode_ == AutoMixedPrecisionMode::FP16_CPU && IsMKLEnabled() && - is_fp16_enabled_on_cpu_)) { - return false; - } else { - return true; - } - } - std::unique_ptr virtual_cluster_; bool gpu_available_; - bool is_fp16_enabled_on_cpu_; AutoMixedPrecisionMode mode_; }; @@ -257,7 +245,6 @@ class AutoMixedPrecisionParamTest : public AutoMixedPrecisionTest, }; TEST_P(AutoMixedPrecisionParamTest, NoOp) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.234f, {32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -295,7 +282,6 @@ TEST_P(AutoMixedPrecisionParamTest, NoOp) { } TEST_P(AutoMixedPrecisionParamTest, AlreadyFp16) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f, {32, 32}); Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_HALF); @@ -334,7 +320,6 @@ TEST_P(AutoMixedPrecisionParamTest, AlreadyFp16) { } TEST_P(AutoMixedPrecisionParamTest, Simple) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -384,7 +369,6 @@ TEST_P(AutoMixedPrecisionParamTest, Simple) { } TEST_P(AutoMixedPrecisionParamTest, NoInferOp) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; setenv("TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL", "TREAT_INFER_AS_DENY", 1 /* replace */); tensorflow::Scope s = tensorflow::Scope::NewRootScope(); @@ -436,7 +420,6 @@ TEST_P(AutoMixedPrecisionParamTest, NoInferOp) { } TEST_P(AutoMixedPrecisionParamTest, BidirectionalClearChain) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output clr1 = ops::Relu(s.WithOpName("clr1"), input); @@ -476,7 +459,6 @@ TEST_P(AutoMixedPrecisionParamTest, BidirectionalClearChain) { } TEST_P(AutoMixedPrecisionParamTest, PreserveFetches) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = 
tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output allow1 = ops::MatMul(s.WithOpName("allow1"), input, input); @@ -566,7 +548,6 @@ TEST_P(AutoMixedPrecisionParamTest, PreserveCPUNodes) { } TEST_P(AutoMixedPrecisionParamTest, PreserveIdentityAfterVariable) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output var1 = ops::Variable(s.WithOpName("var1"), {32, 32}, DT_FLOAT); @@ -611,7 +592,6 @@ TEST_P(AutoMixedPrecisionParamTest, PreserveIdentityAfterVariable) { } TEST_P(AutoMixedPrecisionParamTest, FusedBatchNorm) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); // Uses NHWC data format because non-GPU execution does not support NCHW. Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {8, 56, 56, 16}); @@ -671,7 +651,6 @@ TEST_P(AutoMixedPrecisionParamTest, FusedBatchNorm) { } TEST_P(AutoMixedPrecisionParamTest, RepeatedAndListTypeAttrs) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output allow1 = ops::MatMul(s.WithOpName("allow1"), input, input); @@ -712,7 +691,6 @@ TEST_P(AutoMixedPrecisionParamTest, RepeatedAndListTypeAttrs) { } TEST_P(AutoMixedPrecisionParamTest, ExistingCast) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), true, {32, 32}); Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_FLOAT); @@ -745,7 +723,6 @@ TEST_P(AutoMixedPrecisionParamTest, ExistingCast) { } TEST_P(AutoMixedPrecisionParamTest, RecurrentEdgeColorMismatch) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); Output deny1 = ops::Exp(s.WithOpName("deny1"), input); @@ -808,7 +785,6 @@ TEST_P(AutoMixedPrecisionParamTest, RecurrentEdgeColorMismatch) { } TEST_P(AutoMixedPrecisionParamTest, TensorListSetGet) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32, 32}; auto tl1 = ops::TensorListReserve(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT); @@ -880,7 +856,6 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListSetGet) { } TEST_P(AutoMixedPrecisionParamTest, TensorListPushPop) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32, 32}; auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT); @@ -944,7 +919,6 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListPushPop) { } TEST_P(AutoMixedPrecisionParamTest, TensorListFromTensor) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32}; Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32}); @@ -995,7 +969,6 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListFromTensor) { } TEST_P(AutoMixedPrecisionParamTest, TensorListPushBackBatchAndConcatLists) { - if 
(ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); tensorflow::Input shape = {32, 32}; auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT); @@ -1056,7 +1029,6 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListPushBackBatchAndConcatLists) { } TEST_P(AutoMixedPrecisionParamTest, TensorListThroughFunction) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; // This test passes a tensor list handle through a function with its own // Tensor List ops inside to test that the types are not changed to a // conflicting state. @@ -1165,7 +1137,6 @@ bool IsSupportedGPU(const Cluster& cluster) { } TEST_P(AutoMixedPrecisionParamTest, BatchMatMul) { - if (ShouldSkipTest()) GTEST_SKIP() << "This device doesn't support FP16"; tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output input = ops::Const(s.WithOpName("input"), 1.f / 33, {64, 32, 32}); Output allow1 = ops::BatchMatMul(s.WithOpName("allow1"), input, input); @@ -1528,7 +1499,6 @@ TEST_F(AutoMixedPrecisionSimulateGpuTest, Simple_SimulatedGpu_CpuScope) { } #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM || INTEL_MKL #if INTEL_MKL diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 3c0f37d2e9ea4a..3687e6c307e4da 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -231,6 +231,8 @@ std::unique_ptr MetaOptimizer::MakeNewOptimizer( new AutoMixedPrecision(AutoMixedPrecisionMode::CUDA)); #ifdef INTEL_MKL if (IsMKLEnabled()) { + MK_OPT("auto_mixed_precision", "auto_mixed_precision", + new AutoMixedPrecision(AutoMixedPrecisionMode::FP16_CPU)); MK_OPT("auto_mixed_precision_mkl", "auto_mixed_precision_mkl", new AutoMixedPrecision(AutoMixedPrecisionMode::BF16)); MK_OPT("auto_mixed_precision_onednn_bfloat16", diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto index 9f4042e6f8be9b..f98d1928d9e156 100644 --- a/tensorflow/core/protobuf/rewriter_config.proto +++ b/tensorflow/core/protobuf/rewriter_config.proto @@ -102,8 +102,8 @@ message RewriterConfig { // Enable the swap of kernel implementations based on the device placement // (default is ON). Toggle implementation_selector = 22; - // Optimize data types for CUDA (default is OFF). - // This will try to use float16 on GPU which is faster. + // Optimize data types for CUDA/oneDNN (default is OFF). + // This will try to use float16 on GPU/CPU which is faster. // Note that this can change the numerical stability of the graph and may // require the use of loss scaling to maintain model convergence. Toggle auto_mixed_precision = 23; diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index 228bacb7f6443d..6ee1860950cc40 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -242,8 +242,8 @@ def set_optimizer_experimental_options(options): - implementation_selector: Enable the swap of kernel implementations based on the device placement. - auto_mixed_precision: Change certain float32 ops to float16 on Volta - GPUs and above. Without the use of loss scaling, this can cause - numerical underflow (see + GPUs and above; and on CPUs with AMX FP16 support. 
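For instance, a user opts in through the option documented here; a minimal sketch, assuming a recent build (whether float16 is actually used still depends on the device, i.e. a suitable GPU or a CPU with AMX FP16 support):

```python
import tensorflow as tf

# Turn on the auto_mixed_precision grappler rewrite for graphs traced below;
# ops on the allow list (MatMul, Conv2D, ...) may then run in float16.
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

@tf.function
def step(x, w):
    return tf.nn.relu(tf.matmul(x, w))

y = step(tf.random.normal([32, 32]), tf.random.normal([32, 32]))
```

The same toggle maps onto the RewriterConfig.auto_mixed_precision field changed above.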
Without the use of + loss scaling, this can cause numerical underflow (see `keras.mixed_precision.experimental.LossScaleOptimizer`). - disable_meta_optimizer: Disable the entire meta optimizer. - min_graph_nodes: The minimum number of nodes in a graph to optimizer. From 1668cc3c7371287fdd17988eb0e218c8231fc10f Mon Sep 17 00:00:00 2001 From: Surya <116063290+SuryanarayanaY@users.noreply.github.com> Date: Thu, 29 Feb 2024 20:37:10 +0530 Subject: [PATCH 009/670] Fix checkfail in tf.raw_ops.Substr The API tf.raw_ops.Substr currently validates whether the input args pos and len are of same shape or not.Its not checking whether these tensors are empty or not and trying to access the Tensor values directly without validating.If a user passes empty tensors it will lead to assertion failure causing core dumped error. May fixes #63036 --- tensorflow/core/kernels/substr_op.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc index a7880ccc681eff..5f4b2a3a3b0d54 100644 --- a/tensorflow/core/kernels/substr_op.cc +++ b/tensorflow/core/kernels/substr_op.cc @@ -56,7 +56,12 @@ class SubstrOp : public OpKernel { errors::InvalidArgument( "pos and len should have the same shape, got: ", pos_shape.DebugString(), " vs. ", len_shape.DebugString())); - + OP_REQUIRES(context, pos_tensor.NumElements() > 0, + errors::InvalidArgument("received empty tensor pos_tensor: ", + pos_tensor.DebugString())); + OP_REQUIRES(context, len_tensor.NumElements() > 0, + errors::InvalidArgument("received empty tensor len_tensor: ", + len_tensor.DebugString())); bool is_scalar = TensorShapeUtils::IsScalar(pos_shape); if (is_scalar || input_shape == pos_shape) { From 2b164c8cd6b5ae3dd6c664127ebdf1104836eeda Mon Sep 17 00:00:00 2001 From: Surya <116063290+SuryanarayanaY@users.noreply.github.com> Date: Mon, 4 Mar 2024 20:34:41 +0530 Subject: [PATCH 010/670] Fix checkfail in DenseBincount The API raw_ops.DenseBincount lacks validation of input to be vector. It does have checking for rank<=2 but not for rank>0. Passing a scalar value causes checkfail with debug build. Reported at #63068 --- tensorflow/core/ops/math_ops.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index baa487e728d533..2e4a158add1eb8 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1896,6 +1896,9 @@ REGISTER_OP("DenseBincount") c->set_output(0, c->MakeShape({size_val})); } else if (c->Rank(c->input(0)) == 2) { c->set_output(0, c->MakeShape({c->Dim(c->input(0), 0), size_val})); + } else { + return errors::InvalidArgument("input must not be a scalar. 
" + "Recieved input of rank ", c->Rank(c->input(0))); } return absl::OkStatus(); }); From f4532fd0a6905deab9983a258b61928a8a380f3d Mon Sep 17 00:00:00 2001 From: Surya <116063290+SuryanarayanaY@users.noreply.github.com> Date: Wed, 6 Mar 2024 10:22:04 +0530 Subject: [PATCH 011/670] Update math_ops.cc Change the logic to validate rank != 0 explicitly --- tensorflow/core/ops/math_ops.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 2e4a158add1eb8..dc93372c1d9df3 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1896,9 +1896,8 @@ REGISTER_OP("DenseBincount") c->set_output(0, c->MakeShape({size_val})); } else if (c->Rank(c->input(0)) == 2) { c->set_output(0, c->MakeShape({c->Dim(c->input(0), 0), size_val})); - } else { - return errors::InvalidArgument("input must not be a scalar. " - "Recieved input of rank ", c->Rank(c->input(0))); + } else if (c->Rank(c->input(0)) == 0) { + return absl::InvalidArgumentError("The input must not be a scalar. "); } return absl::OkStatus(); }); From 994978a764b23db8be45e5b7747f327fa9e6d47e Mon Sep 17 00:00:00 2001 From: Surya <116063290+SuryanarayanaY@users.noreply.github.com> Date: Thu, 7 Mar 2024 23:00:47 +0530 Subject: [PATCH 012/670] Set output shape to rank 0 input in DensebinCount Set output shape to rank 0 input in DensebinCount Op. --- tensorflow/core/ops/math_ops.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index dc93372c1d9df3..192899b6726364 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1892,12 +1892,10 @@ REGISTER_OP("DenseBincount") return errors::InvalidArgument("size (", size_val, ") must be non-negative"); } - if (c->Rank(c->input(0)) == 1) { + if (c->Rank(c->input(0)) == 1 || c->Rank(c->input(0)) == 0) { c->set_output(0, c->MakeShape({size_val})); } else if (c->Rank(c->input(0)) == 2) { c->set_output(0, c->MakeShape({c->Dim(c->input(0), 0), size_val})); - } else if (c->Rank(c->input(0)) == 0) { - return absl::InvalidArgumentError("The input must not be a scalar. "); } return absl::OkStatus(); }); From a4efadfd30e535f50a000f7dd2853cb9bde88301 Mon Sep 17 00:00:00 2001 From: Surya <116063290+SuryanarayanaY@users.noreply.github.com> Date: Fri, 8 Mar 2024 10:05:30 +0530 Subject: [PATCH 013/670] Support for Rank 0 Input for DenseBinCount Op Support for Rank 0 Input for DenseBinCount Op. Assuming that tensor.flat works with scalar tensor also. --- tensorflow/core/kernels/bincount_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/bincount_op.cc b/tensorflow/core/kernels/bincount_op.cc index a680a0e4e7a2e3..1a1e55ed067fd3 100644 --- a/tensorflow/core/kernels/bincount_op.cc +++ b/tensorflow/core/kernels/bincount_op.cc @@ -308,7 +308,7 @@ class DenseBincountOp : public OpKernel { Tensor* out_t; functor::SetZeroFunctor fill; - if (data.dims() == 1) { + if (data.dims() <= 1) { OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({size}), &out_t)); auto out = out_t->flat(); fill(ctx->eigen_device(), out); From 176048ad39a1d928208b92e1503e2d8d2cd35d28 Mon Sep 17 00:00:00 2001 From: Surya <116063290+SuryanarayanaY@users.noreply.github.com> Date: Mon, 11 Mar 2024 15:50:05 +0530 Subject: [PATCH 014/670] Fix checkfail in UnicodeEncode Op The Op UnicodeEncode segfaults when passed 2D tensor to `input_splits`. 
It has the below check in SetShapeFn which supposed to raise exception if rank !=1 AFAIk. This seems not working for reason unknown to me. https://github.com/tensorflow/tensorflow/blob/6f64ad5d767a034df45a5eaab8b36fd688cd1217/tensorflow/core/ops/string_ops.cc#L316-L317 Same with input_values argument also. Added an explicit check in Op. --- tensorflow/core/kernels/unicode_ops.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/core/kernels/unicode_ops.cc b/tensorflow/core/kernels/unicode_ops.cc index 3d59cc034480b3..a454e6f69ec646 100644 --- a/tensorflow/core/kernels/unicode_ops.cc +++ b/tensorflow/core/kernels/unicode_ops.cc @@ -532,6 +532,10 @@ class UnicodeEncodeOp : public OpKernel { const Tensor& input_splits = context->input(1); const auto input_splits_flat = input_splits.flat(); + OP_REQUIRES( + context, input_tensor.dims() == 1 && input_splits.dims() == 1, + absl::InvalidArgumentError( + "Both the input_tensor and input_splits should be of rank 1. ")); OP_REQUIRES( context, input_splits.NumElements() > 0, errors::InvalidArgument("Input_splits should contain elements, but " From 3c7b63ecd0afc101c0c889b194e4869906054043 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 16 Mar 2024 10:21:21 -0700 Subject: [PATCH 015/670] Integrate LLVM at llvm/llvm-project@a4ca07f13b56 Updates LLVM usage to match [a4ca07f13b56](https://github.com/llvm/llvm-project/commit/a4ca07f13b56) PiperOrigin-RevId: 616433576 --- .../tensorflow/utils/dump_mlir_util_test.cc | 2 +- third_party/llvm/generated.patch | 811 ++++++++++++++---- third_party/llvm/workspace.bzl | 4 +- .../service/cpu/hlo_xla_runtime_pipeline.cc | 8 +- 4 files changed, 656 insertions(+), 169 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc index bb474b1413f7ac..2efd63b29b04ef 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc @@ -125,7 +125,7 @@ TEST(DumpCrashReproducerTest, RoundtripDumpAndReadValid) { EXPECT_TRUE(mlir::MlirOptMain(output_stream->os(), std::move(input_file), registry, mlir::MlirOptMainConfig{} - .splitInputFile(false) + .splitInputFile("") .verifyDiagnostics(false) .verifyPasses(false) .allowUnregisteredDialects(false) diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch index b75801c374943b..575d74a4816f67 100644 --- a/third_party/llvm/generated.patch +++ b/third_party/llvm/generated.patch @@ -1,183 +1,670 @@ Auto generated patch. Do not edit or delete it, even if empty. -diff -ruN --strip-trailing-cr a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h ---- a/llvm/include/llvm/IR/AutoUpgrade.h -+++ b/llvm/include/llvm/IR/AutoUpgrade.h -@@ -88,9 +88,6 @@ - /// info. Return true if module is modified. - bool UpgradeDebugInfo(Module &M); - -- /// Copies module attributes to the functions in the module. -- void CopyModuleAttrToFunctions(Module &M); -- - /// Check whether a string looks like an old loop attachment tag. 
- inline bool mayBeOldLoopAttachmentTag(StringRef Name) { - return Name.starts_with("llvm.vectorizer."); -diff -ruN --strip-trailing-cr a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp ---- a/llvm/lib/IR/AutoUpgrade.cpp -+++ b/llvm/lib/IR/AutoUpgrade.cpp -@@ -5178,72 +5178,6 @@ - Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType())); +diff -ruN --strip-trailing-cr a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst +--- a/clang/docs/ReleaseNotes.rst ++++ b/clang/docs/ReleaseNotes.rst +@@ -201,21 +201,6 @@ + and each must be a positive integer when provided. The parameter ``x`` is required, while ``y`` and + ``z`` are optional with default value of 1. + +-- The ``_Nullable`` and ``_Nonnull`` family of type attributes can now apply +- to certain C++ class types, such as smart pointers: +- ``void useObject(std::unique_ptr _Nonnull obj);``. +- +- This works for standard library types including ``unique_ptr``, ``shared_ptr``, +- and ``function``. See +- `the attribute reference documentation `_ +- for the full list. +- +-- The ``_Nullable`` attribute can be applied to C++ class declarations: +- ``template class _Nullable MySmartPointer {};``. +- +- This allows the ``_Nullable`` and ``_Nonnull`` family of type attributes to +- apply to this class. +- + Improvements to Clang's diagnostics + ----------------------------------- + - Clang now applies syntax highlighting to the code snippets it +diff -ruN --strip-trailing-cr a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td +--- a/clang/include/clang/Basic/AttrDocs.td ++++ b/clang/include/clang/Basic/AttrDocs.td +@@ -4151,20 +4151,6 @@ + @property (assign, nullable) NSView *superview; + @property (readonly, nonnull) NSArray *subviews; + @end +- +-As well as built-in pointer types, the nullability attributes can be attached +-to C++ classes marked with the ``_Nullable`` attribute. +- +-The following C++ standard library types are considered nullable: +-``unique_ptr``, ``shared_ptr``, ``auto_ptr``, ``exception_ptr``, ``function``, +-``move_only_function`` and ``coroutine_handle``. +- +-Types should be marked nullable only where the type itself leaves nullability +-ambiguous. For example, ``std::optional`` is not marked ``_Nullable``, because +-``optional _Nullable`` is redundant and ``optional _Nonnull`` is +-not a useful type. ``std::weak_ptr`` is not nullable, because its nullability +-can change with no visible modification, so static annotation is unlikely to be +-unhelpful. + }]; } --// Check if the module attribute is present and not zero. --static bool isModuleAttributeSet(Module &M, const StringRef &ModAttr) { -- const auto *Attr = -- mdconst::extract_or_null(M.getModuleFlag(ModAttr)); -- return Attr && Attr->getZExtValue(); --} +@@ -4199,17 +4185,6 @@ + int fetch_or_zero(int * _Nullable ptr); + + a caller of ``fetch_or_zero`` can provide null. - --// Copy an attribute from module to the function if exists. --// First value of the pair is used when the module attribute is not zero --// the second otherwise. --static void --CopyModuleAttributeToFunction(Function &F, StringRef FnAttrName, -- StringRef ModAttrName, -- std::pair Values) { -- if (F.hasFnAttribute(FnAttrName)) -- return; -- F.addFnAttr(FnAttrName, isModuleAttributeSet(*F.getParent(), ModAttrName) -- ? Values.first -- : Values.second); --} +-The ``_Nullable`` attribute on classes indicates that the given class can +-represent null values, and so the ``_Nullable``, ``_Nonnull`` etc qualifiers +-make sense for this type. 
For example: - --// Copy a boolean attribute from module to the function if exists. --// Module attribute treated false if zero otherwise true. --static void CopyModuleAttributeToFunction(Function &F, StringRef AttrName) { -- CopyModuleAttributeToFunction( -- F, AttrName, AttrName, -- std::make_pair("true", "false")); +- .. code-block:: c +- +- class _Nullable ArenaPointer { ... }; +- +- ArenaPointer _Nonnull x = ...; +- ArenaPointer _Nullable y = nullptr; + }]; + } + +diff -ruN --strip-trailing-cr a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td +--- a/clang/include/clang/Basic/Attr.td ++++ b/clang/include/clang/Basic/Attr.td +@@ -2178,10 +2178,9 @@ + let Documentation = [TypeNonNullDocs]; + } + +-def TypeNullable : DeclOrTypeAttr { ++def TypeNullable : TypeAttr { + let Spellings = [CustomKeyword<"_Nullable">]; + let Documentation = [TypeNullableDocs]; +-// let Subjects = SubjectList<[CXXRecord], ErrorDiag>; + } + + def TypeNullableResult : TypeAttr { +diff -ruN --strip-trailing-cr a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def +--- a/clang/include/clang/Basic/Features.def ++++ b/clang/include/clang/Basic/Features.def +@@ -94,7 +94,6 @@ + FEATURE(enumerator_attributes, true) + FEATURE(nullability, true) + FEATURE(nullability_on_arrays, true) +-FEATURE(nullability_on_classes, true) + FEATURE(nullability_nullable_result, true) + FEATURE(memory_sanitizer, + LangOpts.Sanitize.hasOneOf(SanitizerKind::Memory | +diff -ruN --strip-trailing-cr a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h +--- a/clang/include/clang/Parse/Parser.h ++++ b/clang/include/clang/Parse/Parser.h +@@ -3014,7 +3014,6 @@ + void DiagnoseAndSkipExtendedMicrosoftTypeAttributes(); + SourceLocation SkipExtendedMicrosoftTypeAttributes(); + void ParseMicrosoftInheritanceClassAttributes(ParsedAttributes &attrs); +- void ParseNullabilityClassAttributes(ParsedAttributes &attrs); + void ParseBorlandTypeAttributes(ParsedAttributes &attrs); + void ParseOpenCLKernelAttributes(ParsedAttributes &attrs); + void ParseOpenCLQualifiers(ParsedAttributes &Attrs); +diff -ruN --strip-trailing-cr a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h +--- a/clang/include/clang/Sema/Sema.h ++++ b/clang/include/clang/Sema/Sema.h +@@ -1655,9 +1655,6 @@ + /// Add [[gsl::Pointer]] attributes for std:: types. + void inferGslPointerAttribute(TypedefNameDecl *TD); + +- /// Add _Nullable attributes for std:: types. +- void inferNullableClassAttribute(CXXRecordDecl *CRD); +- + enum PragmaOptionsAlignKind { + POAK_Native, // #pragma options align=native + POAK_Natural, // #pragma options align=natural +diff -ruN --strip-trailing-cr a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp +--- a/clang/lib/AST/Type.cpp ++++ b/clang/lib/AST/Type.cpp +@@ -4558,15 +4558,16 @@ + case Type::Auto: + return ResultIfUnknown; + +- // Dependent template specializations could instantiate to pointer types. ++ // Dependent template specializations can instantiate to pointer ++ // types unless they're known to be specializations of a class ++ // template. + case Type::TemplateSpecialization: +- // If it's a known class template, we can already check if it's nullable. 
+- if (TemplateDecl *templateDecl = +- cast(type.getTypePtr()) +- ->getTemplateName() +- .getAsTemplateDecl()) +- if (auto *CTD = dyn_cast(templateDecl)) +- return CTD->getTemplatedDecl()->hasAttr(); ++ if (TemplateDecl *templateDecl ++ = cast(type.getTypePtr()) ++ ->getTemplateName().getAsTemplateDecl()) { ++ if (isa(templateDecl)) ++ return false; ++ } + return ResultIfUnknown; + + case Type::Builtin: +@@ -4623,17 +4624,6 @@ + } + llvm_unreachable("unknown builtin type"); + +- case Type::Record: { +- const RecordDecl *RD = cast(type)->getDecl(); +- // For template specializations, look only at primary template attributes. +- // This is a consistent regardless of whether the instantiation is known. +- if (const auto *CTSD = dyn_cast(RD)) +- return CTSD->getSpecializedTemplate() +- ->getTemplatedDecl() +- ->hasAttr(); +- return RD->hasAttr(); +- } +- + // Non-pointer types. + case Type::Complex: + case Type::LValueReference: +@@ -4651,6 +4641,7 @@ + case Type::DependentAddressSpace: + case Type::FunctionProto: + case Type::FunctionNoProto: ++ case Type::Record: + case Type::DeducedTemplateSpecialization: + case Type::Enum: + case Type::InjectedClassName: +diff -ruN --strip-trailing-cr a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp +--- a/clang/lib/CodeGen/CGCall.cpp ++++ b/clang/lib/CodeGen/CGCall.cpp +@@ -4372,8 +4372,7 @@ + NNAttr = getNonNullAttr(AC.getDecl(), PVD, ArgType, ArgNo); + + bool CanCheckNullability = false; +- if (SanOpts.has(SanitizerKind::NullabilityArg) && !NNAttr && PVD && +- !PVD->getType()->isRecordType()) { ++ if (SanOpts.has(SanitizerKind::NullabilityArg) && !NNAttr && PVD) { + auto Nullability = PVD->getType()->getNullability(); + CanCheckNullability = Nullability && + *Nullability == NullabilityKind::NonNull && +diff -ruN --strip-trailing-cr a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp +--- a/clang/lib/CodeGen/CodeGenFunction.cpp ++++ b/clang/lib/CodeGen/CodeGenFunction.cpp +@@ -979,8 +979,7 @@ + // return value. Initialize the flag to 'true' and refine it in EmitParmDecl. + if (SanOpts.has(SanitizerKind::NullabilityReturn)) { + auto Nullability = FnRetTy->getNullability(); +- if (Nullability && *Nullability == NullabilityKind::NonNull && +- !FnRetTy->isRecordType()) { ++ if (Nullability && *Nullability == NullabilityKind::NonNull) { + if (!(SanOpts.has(SanitizerKind::ReturnsNonnullAttribute) && + CurCodeDecl && CurCodeDecl->getAttr())) + RetValNullabilityPrecondition = +diff -ruN --strip-trailing-cr a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp +--- a/clang/lib/Parse/ParseDeclCXX.cpp ++++ b/clang/lib/Parse/ParseDeclCXX.cpp +@@ -1494,15 +1494,6 @@ + } + } + +-void Parser::ParseNullabilityClassAttributes(ParsedAttributes &attrs) { +- while (Tok.is(tok::kw__Nullable)) { +- IdentifierInfo *AttrName = Tok.getIdentifierInfo(); +- auto Kind = Tok.getKind(); +- SourceLocation AttrNameLoc = ConsumeToken(); +- attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0, Kind); +- } -} - --// Copy an attribute from module to the function if exists. --// First value of the pair is used when the module attribute is not zero --// the second otherwise. --static void --CopyModuleAttributeToFunction(Function &F, StringRef AttrName, -- std::pair Values) { -- CopyModuleAttributeToFunction(F, AttrName, AttrName, Values); + /// Determine whether the following tokens are valid after a type-specifier + /// which could be a standalone declaration. 
This will conservatively return + /// true if there's any doubt, and is appropriate for insert-';' fixits. +@@ -1684,21 +1675,15 @@ + + ParsedAttributes attrs(AttrFactory); + // If attributes exist after tag, parse them. +- for (;;) { +- MaybeParseAttributes(PAKM_CXX11 | PAKM_Declspec | PAKM_GNU, attrs); +- // Parse inheritance specifiers. +- if (Tok.isOneOf(tok::kw___single_inheritance, +- tok::kw___multiple_inheritance, +- tok::kw___virtual_inheritance)) { +- ParseMicrosoftInheritanceClassAttributes(attrs); +- continue; +- } +- if (Tok.is(tok::kw__Nullable)) { +- ParseNullabilityClassAttributes(attrs); +- continue; +- } +- break; +- } ++ MaybeParseAttributes(PAKM_CXX11 | PAKM_Declspec | PAKM_GNU, attrs); ++ ++ // Parse inheritance specifiers. ++ if (Tok.isOneOf(tok::kw___single_inheritance, tok::kw___multiple_inheritance, ++ tok::kw___virtual_inheritance)) ++ ParseMicrosoftInheritanceClassAttributes(attrs); ++ ++ // Allow attributes to precede or succeed the inheritance specifiers. ++ MaybeParseAttributes(PAKM_CXX11 | PAKM_Declspec | PAKM_GNU, attrs); + + // Source location used by FIXIT to insert misplaced + // C++11 attributes +diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp +--- a/clang/lib/Sema/SemaAttr.cpp ++++ b/clang/lib/Sema/SemaAttr.cpp +@@ -215,18 +215,6 @@ + inferGslPointerAttribute(Record, Record); + } + +-void Sema::inferNullableClassAttribute(CXXRecordDecl *CRD) { +- static llvm::StringSet<> Nullable{ +- "auto_ptr", "shared_ptr", "unique_ptr", "exception_ptr", +- "coroutine_handle", "function", "move_only_function", +- }; +- +- if (CRD->isInStdNamespace() && Nullable.count(CRD->getName()) && +- !CRD->hasAttr()) +- for (Decl *Redecl : CRD->redecls()) +- Redecl->addAttr(TypeNullableAttr::CreateImplicit(Context)); -} - --void llvm::CopyModuleAttrToFunctions(Module &M) { -- Triple T(M.getTargetTriple()); -- if (!T.isThumb() && !T.isARM() && !T.isAArch64()) + void Sema::ActOnPragmaOptionsAlign(PragmaOptionsAlignKind Kind, + SourceLocation PragmaLoc) { + PragmaMsStackAction Action = Sema::PSK_Reset; +diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp +--- a/clang/lib/Sema/SemaChecking.cpp ++++ b/clang/lib/Sema/SemaChecking.cpp +@@ -27,7 +27,6 @@ + #include "clang/AST/ExprObjC.h" + #include "clang/AST/ExprOpenMP.h" + #include "clang/AST/FormatString.h" +-#include "clang/AST/IgnoreExpr.h" + #include "clang/AST/NSAPI.h" + #include "clang/AST/NonTrivialTypeVisitor.h" + #include "clang/AST/OperationKinds.h" +@@ -7358,14 +7357,6 @@ + /// + /// Returns true if the value evaluates to null. + static bool CheckNonNullExpr(Sema &S, const Expr *Expr) { +- // Treat (smart) pointers constructed from nullptr as null, whether we can +- // const-evaluate them or not. +- // This must happen first: the smart pointer expr might have _Nonnull type! +- if (isa( +- IgnoreExprNodes(Expr, IgnoreImplicitAsWrittenSingleStep, +- IgnoreElidableImplicitConstructorSingleStep))) +- return true; +- + // If the expression has non-null type, it doesn't evaluate to null. 
+ if (auto nullability = Expr->IgnoreImplicit()->getType()->getNullability()) { + if (*nullability == NullabilityKind::NonNull) +diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp +--- a/clang/lib/Sema/SemaDeclAttr.cpp ++++ b/clang/lib/Sema/SemaDeclAttr.cpp +@@ -5976,20 +5976,6 @@ + D->addAttr(::new (S.Context) BuiltinAliasAttr(S.Context, AL, Ident)); + } + +-static void handleNullableTypeAttr(Sema &S, Decl *D, const ParsedAttr &AL) { +- if (AL.isUsedAsTypeAttr()) - return; - -- for (Function &F : M.getFunctionList()) { -- if (F.isDeclaration()) -- continue; +- if (auto *CRD = dyn_cast(D); +- !CRD || !(CRD->isClass() || CRD->isStruct())) { +- S.Diag(AL.getRange().getBegin(), diag::err_attribute_wrong_decl_type_str) +- << AL << AL.isRegularKeywordAttribute() << "classes"; +- return; +- } - -- if (!F.hasFnAttribute("sign-return-address")) { -- StringRef SignType = "none"; -- if (isModuleAttributeSet(M, "sign-return-address")) -- SignType = "non-leaf"; +- handleSimpleAttribute(S, D, AL); +-} - -- if (isModuleAttributeSet(M, "sign-return-address-all")) -- SignType = "all"; + static void handlePreferredTypeAttr(Sema &S, Decl *D, const ParsedAttr &AL) { + if (!AL.hasParsedType()) { + S.Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments) << AL << 1; +@@ -9959,10 +9945,6 @@ + case ParsedAttr::AT_UsingIfExists: + handleSimpleAttribute(S, D, AL); + break; - -- F.addFnAttr("sign-return-address", SignType); -- } -- CopyModuleAttributeToFunction(F, "branch-target-enforcement"); -- CopyModuleAttributeToFunction(F, "branch-protection-pauth-lr"); -- CopyModuleAttributeToFunction(F, "guarded-control-stack"); -- CopyModuleAttributeToFunction( -- F, "sign-return-address-key", -- std::make_pair("b_key", "a_key")); +- case ParsedAttr::AT_TypeNullable: +- handleNullableTypeAttr(S, D, AL); +- break; + } + } + +diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp +--- a/clang/lib/Sema/SemaDecl.cpp ++++ b/clang/lib/Sema/SemaDecl.cpp +@@ -18254,10 +18254,8 @@ + if (PrevDecl) + mergeDeclAttributes(New, PrevDecl); + +- if (auto *CXXRD = dyn_cast(New)) { ++ if (auto *CXXRD = dyn_cast(New)) + inferGslOwnerPointerAttribute(CXXRD); +- inferNullableClassAttribute(CXXRD); - } + + // If there's a #pragma GCC visibility in scope, set the visibility of this + // record. +diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp +--- a/clang/lib/Sema/SemaInit.cpp ++++ b/clang/lib/Sema/SemaInit.cpp +@@ -7075,11 +7075,6 @@ + hasCopyOrMoveCtorParam(S.Context, + getConstructorInfo(Step.Function.FoundDecl)); + +- // A smart pointer constructed from a nullable pointer is nullable. +- if (NumArgs == 1 && !Kind.isExplicitCast()) +- S.diagnoseNullableToNonnullConversion( +- Entity.getType(), Args.front()->getType(), Kind.getLocation()); +- + // Determine the arguments required to actually perform the constructor + // call. + if (S.CompleteConstructorCall(Constructor, Step.Type, Args, Loc, +diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp +--- a/clang/lib/Sema/SemaOverload.cpp ++++ b/clang/lib/Sema/SemaOverload.cpp +@@ -14797,13 +14797,6 @@ + } + } + +- // Check for nonnull = nullable. +- // This won't be caught in the arg's initialization: the parameter to +- // the assignment operator is not marked nonnull. +- if (Op == OO_Equal) +- diagnoseNullableToNonnullConversion(Args[0]->getType(), +- Args[1]->getType(), OpLoc); +- + // Convert the arguments. 
+ if (CXXMethodDecl *Method = dyn_cast(FnDecl)) { + // Best->Access is only meaningful for class members. +diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp +--- a/clang/lib/Sema/SemaTemplate.cpp ++++ b/clang/lib/Sema/SemaTemplate.cpp +@@ -2171,7 +2171,6 @@ + + AddPushedVisibilityAttribute(NewClass); + inferGslOwnerPointerAttribute(NewClass); +- inferNullableClassAttribute(NewClass); + + if (TUK != TUK_Friend) { + // Per C++ [basic.scope.temp]p2, skip the template parameter scopes. +diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp +--- a/clang/lib/Sema/SemaType.cpp ++++ b/clang/lib/Sema/SemaType.cpp +@@ -4711,18 +4711,6 @@ + return false; + } + +-// Whether this is a type broadly expected to have nullability attached. +-// These types are affected by `#pragma assume_nonnull`, and missing nullability +-// will be diagnosed with -Wnullability-completeness. +-static bool shouldHaveNullability(QualType T) { +- return T->canHaveNullability(/*ResultIfUnknown=*/false) && +- // For now, do not infer/require nullability on C++ smart pointers. +- // It's unclear whether the pragma's behavior is useful for C++. +- // e.g. treating type-aliases and template-type-parameters differently +- // from types of declarations can be surprising. +- !isa(T); -} - - static bool isOldLoopArgument(Metadata *MD) { - auto *T = dyn_cast_or_null(MD); - if (!T) -diff -ruN --strip-trailing-cr a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp ---- a/llvm/lib/Linker/IRMover.cpp -+++ b/llvm/lib/Linker/IRMover.cpp -@@ -1606,11 +1606,6 @@ - // Loop over all of the linked values to compute type mappings. - computeTypeMapping(); - -- // Convert module level attributes to function level attributes because -- // after merging modules the attributes might change and would have different -- // effect on the functions as the original module would have. -- CopyModuleAttrToFunctions(*SrcM); -- - std::reverse(Worklist.begin(), Worklist.end()); - while (!Worklist.empty()) { - GlobalValue *GV = Worklist.back(); -diff -ruN --strip-trailing-cr a/llvm/test/Linker/link-arm-and-thumb.ll b/llvm/test/Linker/link-arm-and-thumb.ll ---- a/llvm/test/Linker/link-arm-and-thumb.ll -+++ b/llvm/test/Linker/link-arm-and-thumb.ll -@@ -13,12 +13,11 @@ - ret i32 %add + static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state, + QualType declSpecType, + TypeSourceInfo *TInfo) { +@@ -4841,7 +4829,8 @@ + // inner pointers. + complainAboutMissingNullability = CAMN_InnerPointers; + +- if (shouldHaveNullability(T) && !T->getNullability()) { ++ if (T->canHaveNullability(/*ResultIfUnknown*/ false) && ++ !T->getNullability()) { + // Note that we allow but don't require nullability on dependent types. + ++NumPointersRemaining; + } +@@ -5064,7 +5053,8 @@ + // If the type itself could have nullability but does not, infer pointer + // nullability and perform consistency checking. + if (S.CodeSynthesisContexts.empty()) { +- if (shouldHaveNullability(T) && !T->getNullability()) { ++ if (T->canHaveNullability(/*ResultIfUnknown*/ false) && ++ !T->getNullability()) { + if (isVaList(T)) { + // Record that we've seen a pointer, but do nothing else. 
+ if (NumPointersRemaining > 0) +diff -ruN --strip-trailing-cr a/clang/test/Sema/nullability.c b/clang/test/Sema/nullability.c +--- a/clang/test/Sema/nullability.c ++++ b/clang/test/Sema/nullability.c +@@ -248,5 +248,3 @@ + void (^withTypedefBad)(INTS _Nonnull [2]) = // expected-error {{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'INTS' (aka 'int[4]')}} + ^(INTS _Nonnull x[2]) {}; // expected-error {{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'INTS' (aka 'int[4]')}} } +- +-struct _Nullable NotCplusplusClass {}; // expected-error {{'_Nullable' attribute only applies to classes}} +diff -ruN --strip-trailing-cr a/clang/test/SemaCXX/nullability.cpp b/clang/test/SemaCXX/nullability.cpp +--- a/clang/test/SemaCXX/nullability.cpp ++++ b/clang/test/SemaCXX/nullability.cpp +@@ -4,10 +4,6 @@ + #else + # error nullability feature should be defined + #endif +-#if __has_feature(nullability_on_classes) +-#else +-# error smart-pointer feature should be defined +-#endif + + #include "nullability-completeness.h" + +@@ -31,7 +27,6 @@ + struct AddNonNull { + typedef _Nonnull T type; // expected-error{{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'int'}} + // expected-error@-1{{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'std::nullptr_t'}} +- // expected-error@-2{{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'NotPtr'}} + }; --; CHECK: define i32 @main() [[MAIN_ATTRS:#[0-9]+]] -+; CHECK: define i32 @main() { - ; CHECK: define i32 @foo(i32 %a, i32 %b) [[ARM_ATTRS:#[0-9]+]] - ; CHECK: define i32 @bar(i32 %a, i32 %b) [[THUMB_ATTRS:#[0-9]+]] - --; CHECK: attributes [[MAIN_ATTRS]] = { {{.*}} } --; CHECK: attributes [[ARM_ATTRS]] = { {{.*}} "target-features"="-thumb-mode" } --; CHECK: attributes [[THUMB_ATTRS]] = { {{.*}} "target-features"="+thumb-mode" } -+; CHECK: attributes [[ARM_ATTRS]] = { "target-features"="-thumb-mode" } -+; CHECK: attributes [[THUMB_ATTRS]] = { "target-features"="+thumb-mode" } - - ; STDERR-NOT: warning: Linking two modules of different target triples: -diff -ruN --strip-trailing-cr a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll ---- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll -+++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll -@@ -32,7 +32,6 @@ - ; CHECK-DUMP:
: - ; CHECK-DUMP: bl 0x8 - ; CHECK-DUMP: : --; CHECK-DUMP: paciasp - - ; `main` doesn't support BTI while `foo` does, so in the binary - ; we should see only PAC which is supported by both. -diff -ruN --strip-trailing-cr a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll ---- a/llvm/test/LTO/AArch64/link-sign-return-address.ll -+++ b/llvm/test/LTO/AArch64/link-sign-return-address.ll -@@ -1,43 +0,0 @@ --; Testcase to check that module with different branch-target-enforcement can --; be mixed. --; --; RUN: llvm-as %s -o %t1.bc --; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc --; RUN: llvm-lto -exported-symbol main \ --; RUN: -exported-symbol foo \ --; RUN: -filetype=obj \ --; RUN: %t2.bc %t1.bc \ --; RUN: -o %t1.exe 2>&1 --; RUN: llvm-objdump -d %t1.exe | FileCheck --check-prefix=CHECK-DUMP %s --; RUN: llvm-readelf -n %t1.exe | FileCheck --allow-empty --check-prefix=CHECK-PROP %s -- --target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" --target triple = "aarch64-unknown-linux-gnu" -- --declare i32 @foo(); -- --define i32 @main() { --entry: -- %add = call i32 @foo() -- ret i32 %add + typedef AddNonNull::type nonnull_int_ptr_1; +@@ -40,33 +35,6 @@ + + typedef AddNonNull::type nonnull_non_pointer_1; // expected-note{{in instantiation of template class 'AddNonNull' requested here}} + +-// Nullability on C++ class types (smart pointers). +-struct NotPtr{}; +-typedef AddNonNull::type nonnull_non_pointer_2; // expected-note{{in instantiation}} +-struct _Nullable SmartPtr{ +- SmartPtr(); +- SmartPtr(nullptr_t); +- SmartPtr(const SmartPtr&); +- SmartPtr(SmartPtr&&); +- SmartPtr &operator=(const SmartPtr&); +- SmartPtr &operator=(SmartPtr&&); +-}; +-typedef AddNonNull::type nonnull_smart_pointer_1; +-template struct _Nullable SmartPtrTemplate{}; +-typedef AddNonNull>::type nonnull_smart_pointer_2; +-namespace std { inline namespace __1 { +- template class unique_ptr {}; +- template class function; +- template class function {}; +-} } +-typedef AddNonNull>::type nonnull_smart_pointer_3; +-typedef AddNonNull>::type nonnull_smart_pointer_4; +- +-class Derived : public SmartPtr {}; +-Derived _Nullable x; // expected-error {{'_Nullable' cannot be applied}} +-class DerivedPrivate : private SmartPtr {}; +-DerivedPrivate _Nullable y; // expected-error {{'_Nullable' cannot be applied}} +- + // Non-null checking within a template. 
+ template + struct AddNonNull2 { +@@ -86,7 +54,6 @@ + void (X::* accepts_nonnull_3)(_Nonnull int *ptr); + void accepts_nonnull_4(_Nonnull int *ptr); + void (&accepts_nonnull_5)(_Nonnull int *ptr) = accepts_nonnull_4; +-void accepts_nonnull_6(SmartPtr _Nonnull); + + void test_accepts_nonnull_null_pointer_literal(X *x) { + accepts_nonnull_1(0); // expected-warning{{null passed to a callee that requires a non-null argument}} +@@ -94,8 +61,6 @@ + (x->*accepts_nonnull_3)(0); // expected-warning{{null passed to a callee that requires a non-null argument}} + accepts_nonnull_4(0); // expected-warning{{null passed to a callee that requires a non-null argument}} + accepts_nonnull_5(0); // expected-warning{{null passed to a callee that requires a non-null argument}} +- +- accepts_nonnull_6(nullptr); // expected-warning{{null passed to a callee that requires a non-null argument}} + } + + template +@@ -106,7 +71,6 @@ + template void test_accepts_nonnull_null_pointer_literal_template<&accepts_nonnull_4>(); // expected-note{{instantiation of function template specialization}} + + void TakeNonnull(void *_Nonnull); +-void TakeSmartNonnull(SmartPtr _Nonnull); + // Check different forms of assignment to a nonull type from a nullable one. + void AssignAndInitNonNull() { + void *_Nullable nullable; +@@ -117,26 +81,12 @@ + void *_Nonnull nonnull; + nonnull = nullable; // expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull'}} + nonnull = {nullable}; // expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull'}} ++ + TakeNonnull(nullable); //expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull}} + TakeNonnull(nonnull); // OK +- nonnull = (void *_Nonnull)nullable; // explicit cast OK +- +- SmartPtr _Nullable s_nullable; +- SmartPtr _Nonnull s(s_nullable); // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} +- SmartPtr _Nonnull s2{s_nullable}; // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} +- SmartPtr _Nonnull s3 = {s_nullable}; // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} +- SmartPtr _Nonnull s4 = s_nullable; // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} +- SmartPtr _Nonnull s_nonnull; +- s_nonnull = s_nullable; // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} +- s_nonnull = {s_nullable}; // no warning here - might be nice? 
+- TakeSmartNonnull(s_nullable); //expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull}} +- TakeSmartNonnull(s_nonnull); // OK +- s_nonnull = (SmartPtr _Nonnull)s_nullable; // explicit cast OK +- s_nonnull = static_cast(s_nullable); // explicit cast OK + } + + void *_Nullable ReturnNullable(); +-SmartPtr _Nullable ReturnSmartNullable(); + + void AssignAndInitNonNullFromFn() { + void *_Nonnull p(ReturnNullable()); // expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull'}} +@@ -146,16 +96,8 @@ + void *_Nonnull nonnull; + nonnull = ReturnNullable(); // expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull'}} + nonnull = {ReturnNullable()}; // expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull'}} +- TakeNonnull(ReturnNullable()); //expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull}} + +- SmartPtr _Nonnull s(ReturnSmartNullable()); // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} +- SmartPtr _Nonnull s2{ReturnSmartNullable()}; // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} +- SmartPtr _Nonnull s3 = {ReturnSmartNullable()}; // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} +- SmartPtr _Nonnull s4 = ReturnSmartNullable(); // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} +- SmartPtr _Nonnull s_nonnull; +- s_nonnull = ReturnSmartNullable(); // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} +- s_nonnull = {ReturnSmartNullable()}; +- TakeSmartNonnull(ReturnSmartNullable()); // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} ++ TakeNonnull(ReturnNullable()); //expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull}} + } + + void ConditionalExpr(bool c) { +diff -ruN --strip-trailing-cr a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp ++++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +@@ -1019,6 +1019,7 @@ + const DataLayout &DL = getDataLayout(); + + // GlobalVariables are always constant pointers themselves. 
++ PointerType *PTy = GVar->getType(); + Type *ETy = GVar->getValueType(); + + if (GVar->hasExternalLinkage()) { +@@ -1026,9 +1027,6 @@ + O << ".visible "; + else + O << ".extern "; +- } else if (GVar->hasCommonLinkage() && +- GVar->getAddressSpace() == ADDRESS_SPACE_GLOBAL) { +- O << ".common "; + } else if (GVar->hasLinkOnceLinkage() || GVar->hasWeakLinkage() || + GVar->hasAvailableExternallyLinkage() || + GVar->hasCommonLinkage()) { +@@ -1140,7 +1138,7 @@ + } + + O << "."; +- emitPTXAddressSpace(GVar->getAddressSpace(), O); ++ emitPTXAddressSpace(PTy->getAddressSpace(), O); + + if (isManaged(*GVar)) { + if (STI.getPTXVersion() < 40 || STI.getSmVersion() < 30) { +@@ -1169,8 +1167,8 @@ + // Ptx allows variable initilization only for constant and global state + // spaces. + if (GVar->hasInitializer()) { +- if ((GVar->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || +- (GVar->getAddressSpace() == ADDRESS_SPACE_CONST)) { ++ if ((PTy->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || ++ (PTy->getAddressSpace() == ADDRESS_SPACE_CONST)) { + const Constant *Initializer = GVar->getInitializer(); + // 'undef' is treated as there is no value specified. + if (!Initializer->isNullValue() && !isa(Initializer)) { +@@ -1185,7 +1183,7 @@ + !isa(GVar->getInitializer())) { + report_fatal_error("initial value of '" + GVar->getName() + + "' is not allowed in addrspace(" + +- Twine(GVar->getAddressSpace()) + ")"); ++ Twine(PTy->getAddressSpace()) + ")"); + } + } + } +@@ -1204,8 +1202,8 @@ + ElementSize = DL.getTypeStoreSize(ETy); + // Ptx allows variable initilization only for constant and + // global state spaces. +- if (((GVar->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || +- (GVar->getAddressSpace() == ADDRESS_SPACE_CONST)) && ++ if (((PTy->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || ++ (PTy->getAddressSpace() == ADDRESS_SPACE_CONST)) && + GVar->hasInitializer()) { + const Constant *Initializer = GVar->getInitializer(); + if (!isa(Initializer) && !Initializer->isNullValue()) { +diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/NVPTX/common-linkage.ll b/llvm/test/CodeGen/NVPTX/common-linkage.ll +--- a/llvm/test/CodeGen/NVPTX/common-linkage.ll ++++ b/llvm/test/CodeGen/NVPTX/common-linkage.ll +@@ -1,26 +0,0 @@ +-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} +- +-; CHECK: .common .global .align 4 .u32 g +-@g = common addrspace(1) global i32 0, align 4 +- +-; CHECK: .weak .const .align 4 .u32 c +-@c = common addrspace(4) global i32 0, align 4 +- +-; CHECK: .weak .shared .align 4 .u32 s +-@s = common addrspace(3) global i32 0, align 4 +- +-define i32 @f1() { +- %1 = load i32, ptr addrspace(1) @g +- ret i32 %1 -} - --!llvm.module.flags = !{!0, !1, !2, !3 } --!0 = !{i32 8, !"branch-target-enforcement", i32 0} --!1 = !{i32 8, !"sign-return-address", i32 0} --!2 = !{i32 8, !"sign-return-address-all", i32 0} --!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0} -- --; CHECK-DUMP: : --; CHECK-DUMP: paciasp --; CHECK-DUMP: mov w0, #0x2a --; CHECK-DUMP: autiasp --; CHECK-DUMP: ret --; CHECK-DUMP:
: --; CHECK-DUMP-NOT: paciasp --; CHECK-DUMP: str x30, --; CHECK-DUMP: bl 0x14 -- --; `main` doesn't support PAC sign-return-address while `foo` does, so in the binary --; we should not see anything. --; CHECK-PROP-NOT: Properties: aarch64 feature: PAC -\ No newline at end of file +-define i32 @f4() { +- %1 = load i32, ptr addrspace(4) @c +- ret i32 %1 +-} +- +-define i32 @f3() { +- %1 = load i32, ptr addrspace(3) @s +- ret i32 %1 +-} +diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/NVPTX/weak-global.ll b/llvm/test/CodeGen/NVPTX/weak-global.ll +--- a/llvm/test/CodeGen/NVPTX/weak-global.ll ++++ b/llvm/test/CodeGen/NVPTX/weak-global.ll +@@ -1,7 +1,7 @@ + ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s + ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} + +-; CHECK: .common .global .align 4 .u32 g ++; CHECK: .weak .global .align 4 .u32 g + @g = common addrspace(1) global i32 zeroinitializer + + define i32 @func0() { diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl index 772469ed4698c1..c190989fc46286 100644 --- a/third_party/llvm/workspace.bzl +++ b/third_party/llvm/workspace.bzl @@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive") def repo(name): """Imports LLVM.""" - LLVM_COMMIT = "3b5e7c83a6e226d5bd7ed2e9b67449b64812074c" - LLVM_SHA256 = "7fa7a38aade8b5fa2f7719cd3b6e2f038fed1b00d7369cdb05b490085de79c91" + LLVM_COMMIT = "a4ca07f13b560b4f6fa5459eef7159e4f9ee9a6b" + LLVM_SHA256 = "fb936389d46b3ce7ee423c0d788e5359da8ce41cfe8996847719920c6f60b044" tf_http_archive( name = name, diff --git a/third_party/xla/xla/service/cpu/hlo_xla_runtime_pipeline.cc b/third_party/xla/xla/service/cpu/hlo_xla_runtime_pipeline.cc index 6cc266d9d8d3e8..1bcab94dae3df8 100644 --- a/third_party/xla/xla/service/cpu/hlo_xla_runtime_pipeline.cc +++ b/third_party/xla/xla/service/cpu/hlo_xla_runtime_pipeline.cc @@ -179,13 +179,13 @@ static Status CreateHloXlaPipeline( } pm.addPass(mlir::createCSEPass()); pm.addPass(mlir::createCanonicalizerPass()); - mlir::bufferization::BufferResultsToOutParamsOptions out_params_options; - out_params_options.filterFn = [](mlir::func::FuncOp* func) { + mlir::bufferization::BufferResultsToOutParamsOpts out_params_opts; + out_params_opts.filterFn = [](mlir::func::FuncOp* func) { // Only transform the entry point. 
return func->getSymName() == "main"; }; - pm.addPass(mlir::bufferization::createBufferResultsToOutParamsPass( - out_params_options)); + pm.addPass( + mlir::bufferization::createBufferResultsToOutParamsPass(out_params_opts)); pm.addNestedPass( mlir::bufferization::createPromoteBuffersToStackPass(nullptr)); From 33a86ec0896b35d1def2c53576ad9cc016796f34 Mon Sep 17 00:00:00 2001 From: Son Tuan Vu Date: Sat, 16 Mar 2024 15:03:28 -0700 Subject: [PATCH 016/670] [xla:gpu][NFC] Add test for multiple sliced operands for AddressComputationThunk PiperOrigin-RevId: 616467493 --- .../runtime/address_computation_thunk_test.cc | 137 ++++++++++++++++++ 1 file changed, 137 insertions(+) diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc index 60c2f808677324..c7a8c6b88a7653 100644 --- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc +++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc @@ -310,6 +310,143 @@ TEST(AddressComputationThunkTest, SlicedNonContiguousGemm) { ASSERT_FALSE(thunk.ExecuteOnStream(params).ok()); } +TEST(AddressComputationThunkTest, MulipleSlicedOperandsGemm) { + se::StreamExecutor* executor = GpuExecutor(); + + se::Stream stream(executor); + TF_ASSERT_OK(stream.Initialize()); + + int64_t length = sizeof(float) * 2 * 4; + int64_t out_length = sizeof(float) * 1; + int64_t offset_length = sizeof(int64_t) * 2; + int64_t slice_length = sizeof(float) * 3; + + // Step 1: + // Prepare embedded and address computation thunks. + + // Preparing buffer allocation slices for thunk creations. + BufferAllocation alloc_lhs(/*index=*/0, length, /*color=*/0); + BufferAllocation::Slice slice_lhs(&alloc_lhs, 0, length); + + BufferAllocation alloc_rhs(/*index=*/1, length, /*color=*/0); + BufferAllocation::Slice slice_rhs(&alloc_rhs, 0, length); + + BufferAllocation alloc_out(/*index=*/2, out_length, /*color=*/0); + BufferAllocation::Slice slice_out(&alloc_out, 0, out_length); + + BufferAllocation alloc_workspace(/*index=*/3, 1024 * 1024, /*color=*/0); + BufferAllocation::Slice slice_workspace(&alloc_workspace, 0, 1024 * 1024); + + BufferAllocation alloc_lhs_offset(/*index=*/4, offset_length, /*color=*/0); + BufferAllocation::Slice slice_lhs_offset(&alloc_lhs_offset, 0, offset_length); + + BufferAllocation alloc_rhs_offset(/*index=*/5, offset_length, /*color=*/0); + BufferAllocation::Slice slice_rhs_offset(&alloc_rhs_offset, 0, offset_length); + + BufferAllocation alloc_lhs_fake(/*index=*/0, slice_length, /*color=*/0); + BufferAllocation::Slice slice_lhs_fake(&alloc_lhs_fake, 0, slice_length); + + BufferAllocation alloc_rhs_fake(/*index=*/1, slice_length, /*color=*/0); + BufferAllocation::Slice slice_rhs_fake(&alloc_rhs_fake, 0, slice_length); + + // Preparing config for GEMM thunk. + auto config = + GemmConfig::For(ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), {}, {1}, + ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), {}, {0}, + ShapeUtil::MakeShape(PrimitiveType::F32, {1, 1}), 1.0, + 0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt, + se::blas::kDefaultComputePrecision, false, false); + ASSERT_TRUE(config.ok()); + + // Creating embedded GEMM thunk. + ThunkSequence seq; + seq.emplace_back(std::make_unique( + Thunk::ThunkInfo(nullptr), config.value(), slice_lhs_fake, slice_rhs_fake, + slice_out, slice_workspace, /*deterministic=*/true)); + + // Wrapping address computation thunk around the GEMM thunk. 
+ AddressComputationThunk thunk( + Thunk::ThunkInfo(nullptr), + std::make_unique(std::move(seq)), {slice_lhs, slice_rhs}, + {slice_out, slice_workspace}, {slice_lhs_offset, slice_rhs_offset}, + {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), + ShapeUtil::MakeShape(PrimitiveType::F32, {8, 1})}, + {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), + ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1})}); + + // Step 2: + // Execute address computation thunk. + // + // Given a `lhs` tensor of shape f32[2,4]{1,0} + // The `lhs` slice that we want to use will be equivalent to this static + // slice op: + // f32[1,3]{1,0} slice(lhs), slice={[0:1], [1:4]} + + // Preparing memory for thunk arguments. + // lhs = [1.0, 2.0, 3.0, 4.0, + // 5.0, 6.0, 7.0, 8.0] + std::vector arr{1, 2, 3, 4, 5, 6, 7, 8}; + se::DeviceMemory lhs = executor->AllocateArray(2 * 4); + TF_ASSERT_OK(stream.Memcpy(&lhs, arr.data(), length)); + + // Given a `rhs` tensor of shape f32[8,1]{1,0} + // The `rhs` slice that we want to use will be equivalent to this static + // slice op: + // f32[3,1]{1,0} slice(rhs), slice={[2:5], [0:1]} + // rhs = [1.0, + // 2.0, + // 3.0, + // 4.0, + // 5.0, + // 6.0, + // 7.0, + // 8.0] + se::DeviceMemory rhs = executor->AllocateArray(8); + std::vector rhs_arr(8, 1); + TF_ASSERT_OK(stream.Memcpy(&rhs, arr.data(), length)); + + se::DeviceMemory out = executor->AllocateArray(1); + TF_ASSERT_OK(stream.MemZero(&out, out_length)); + + se::DeviceMemory workspace = + executor->AllocateArray(1024 * 1024); + TF_ASSERT_OK(stream.MemZero(&workspace, 1024 * 1024)); + + se::DeviceMemory lhs_offset = executor->AllocateArray(2); + std::vector lhs_offset_arr{0, 1}; + TF_ASSERT_OK( + stream.Memcpy(&lhs_offset, lhs_offset_arr.data(), offset_length)); + + se::DeviceMemory rhs_offset = executor->AllocateArray(2); + std::vector rhs_offset_arr{2, 0}; + TF_ASSERT_OK( + stream.Memcpy(&rhs_offset, rhs_offset_arr.data(), offset_length)); + + // Preparing parameters for thunk execution. + ServiceExecutableRunOptions run_options; + BufferAllocations allocations( + {lhs, rhs, out, workspace, lhs_offset, rhs_offset}, 0, + executor->GetAllocator()); + + Thunk::ExecuteParams params = Thunk::ExecuteParams::Create( + run_options, allocations, &stream, &stream, {}, nullptr, nullptr); + + Thunk::ExecutableSource source = {/*text=*/"", /*binary=*/{}}; + TF_ASSERT_OK( + thunk.Initialize({executor, source, &allocations, &stream, &stream})); + + // Execute address computation thunk and verify that it executed a GEMM on the + // right slices. + TF_ASSERT_OK(thunk.ExecuteOnStream(params)); + TF_ASSERT_OK(stream.BlockHostUntilDone()); + + // Copy `out` data back to host for verification. 
+ std::vector dst(1, 0); + TF_ASSERT_OK(stream.Memcpy(dst.data(), out, out_length)); + + ASSERT_EQ(dst, std::vector({2 * 3 + 3 * 4 + 4 * 5})); +} + static absl::Status Memcpy(se::Stream* stream, ffi::BufferBase src, ffi::BufferBase dst) { return stream->MemcpyD2D( From 1283253eb46b9e1caa2c1caa4316dc545272691c Mon Sep 17 00:00:00 2001 From: Son Tuan Vu Date: Sat, 16 Mar 2024 16:05:15 -0700 Subject: [PATCH 017/670] [xla:gpu] Add support for sliced results in AddressComputationThunk PiperOrigin-RevId: 616475170 --- .../gpu/runtime/address_computation_thunk.cc | 167 ++++++++++++++---- .../gpu/runtime/address_computation_thunk.h | 26 ++- .../runtime/address_computation_thunk_test.cc | 151 +++++++++++++++- 3 files changed, 293 insertions(+), 51 deletions(-) diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc index 07ae9ac30f67c8..8affba065d2d78 100644 --- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc +++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc @@ -47,32 +47,55 @@ AddressComputationThunk::AddressComputationThunk( std::vector> operands, std::vector> results, std::vector> - offset_buffer_indices, - std::vector> orig_shapes, - std::vector> sliced_shapes) + operand_offset_buffer_indices, + std::vector> operand_orig_shapes, + std::vector> operand_sliced_shapes, + std::vector> + result_offset_buffer_indices, + std::vector> result_orig_shapes, + std::vector> result_sliced_shapes) : Thunk(Kind::kAddressComputation, thunk_info), embedded_thunk_(std::make_unique( ThunkInfo(thunk_info.op), std::move(*embedded_thunk))), embedded_thunk_operands_(std::move(operands)), embedded_thunk_results_(std::move(results)), - offset_buffer_indices_(std::move(offset_buffer_indices)), - orig_shapes_(std::move(orig_shapes)), - sliced_shapes_(std::move(sliced_shapes)) {} + operand_offset_buffer_indices_(std::move(operand_offset_buffer_indices)), + operand_orig_shapes_(std::move(operand_orig_shapes)), + operand_sliced_shapes_(std::move(operand_sliced_shapes)), + result_offset_buffer_indices_(std::move(result_offset_buffer_indices)), + result_orig_shapes_(std::move(result_orig_shapes)), + result_sliced_shapes_(std::move(result_sliced_shapes)) {} absl::Status AddressComputationThunk::Prepare( const PrepareParams& params, ResourceRequests& resource_requests) { auto num_operands = embedded_thunk_operands_.size(); - TF_RET_CHECK(num_operands == offset_buffer_indices_.size()); - TF_RET_CHECK(num_operands == orig_shapes_.size()); - TF_RET_CHECK(num_operands == sliced_shapes_.size()); + TF_RET_CHECK(num_operands == operand_offset_buffer_indices_.size()); + TF_RET_CHECK(num_operands == operand_orig_shapes_.size()); + TF_RET_CHECK(num_operands == operand_sliced_shapes_.size()); for (unsigned i = 0; i < num_operands; ++i) { - if (sliced_shapes_[i].has_value()) { + if (operand_sliced_shapes_[i].has_value()) { TF_RET_CHECK(embedded_thunk_operands_[i].has_value()); - TF_RET_CHECK(offset_buffer_indices_[i].has_value()); - TF_RET_CHECK(sliced_shapes_[i]->IsArray()); - TF_RET_CHECK(orig_shapes_[i].has_value() && orig_shapes_[i]->IsArray()); + TF_RET_CHECK(operand_offset_buffer_indices_[i].has_value()); + TF_RET_CHECK(operand_sliced_shapes_[i]->IsArray()); + TF_RET_CHECK(operand_orig_shapes_[i].has_value() && + operand_orig_shapes_[i]->IsArray()); + } + } + + auto num_results = embedded_thunk_results_.size(); + TF_RET_CHECK(num_results == result_offset_buffer_indices_.size()); + 
TF_RET_CHECK(num_results == result_orig_shapes_.size()); + TF_RET_CHECK(num_results == result_sliced_shapes_.size()); + for (unsigned i = 0; i < num_results; ++i) { + if (result_sliced_shapes_[i].has_value()) { + TF_RET_CHECK(embedded_thunk_results_[i].has_value()); + TF_RET_CHECK(result_offset_buffer_indices_[i].has_value()); + TF_RET_CHECK(result_sliced_shapes_[i]->IsArray()); + TF_RET_CHECK(result_orig_shapes_[i].has_value() && + result_orig_shapes_[i]->IsArray()); } } + TF_RETURN_IF_ERROR(embedded_thunk_->Prepare(params, resource_requests)); return absl::OkStatus(); } @@ -81,16 +104,38 @@ absl::Status AddressComputationThunk::Initialize( const InitializeParams& params) { TF_RETURN_IF_ERROR(embedded_thunk_->Initialize(params)); - unsigned num_offsets = 0; - for (auto maybe_shape : sliced_shapes_) { - num_offsets += (maybe_shape == std::nullopt) ? 1 : maybe_shape->rank(); + unsigned operand_offset_count = 0; + for (auto maybe_shape : operand_sliced_shapes_) { + operand_offset_count += + (maybe_shape == std::nullopt) ? 1 : maybe_shape->rank(); } - absl::MutexLock lock(&mutex_); - if (auto it = offsets_.find(params.executor); it == offsets_.end()) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr allocation, - params.executor->HostMemoryAllocate(num_offsets * sizeof(int64_t))); - offsets_.emplace(params.executor, std::move(allocation)); + + { + absl::MutexLock lock(&mutex_); + if (auto it = operand_offsets_.find(params.executor); + it == operand_offsets_.end()) { + TF_ASSIGN_OR_RETURN(std::unique_ptr allocation, + params.executor->HostMemoryAllocate( + operand_offset_count * sizeof(int64_t))); + operand_offsets_.emplace(params.executor, std::move(allocation)); + } + } + + unsigned result_offset_count = 0; + for (auto maybe_shape : result_sliced_shapes_) { + result_offset_count += + (maybe_shape == std::nullopt) ? 1 : maybe_shape->rank(); + } + + { + absl::MutexLock lock(&mutex_); + if (auto it = result_offsets_.find(params.executor); + it == result_offsets_.end()) { + TF_ASSIGN_OR_RETURN(std::unique_ptr allocation, + params.executor->HostMemoryAllocate( + result_offset_count * sizeof(int64_t))); + result_offsets_.emplace(params.executor, std::move(allocation)); + } } return absl::OkStatus(); @@ -99,16 +144,17 @@ absl::Status AddressComputationThunk::Initialize( absl::Status AddressComputationThunk::ExecuteOnStream( const ExecuteParams& params) { auto& stream = *params.stream; + std::vector new_buffers; + const BufferAllocations& orig_allocations = *params.buffer_allocations; - // Get memory allocation for copying offsets from device. - int64_t* offsets_base = [&] { + // Get memory allocation for copying operand offsets from device. 
+ int64_t* operand_offsets_base = [&] { absl::MutexLock lock(&mutex_); - return reinterpret_cast(offsets_.at(stream.parent())->opaque()); + return reinterpret_cast( + operand_offsets_.at(stream.parent())->opaque()); }(); - std::vector new_buffers; - const BufferAllocations& orig_allocations = *params.buffer_allocations; - for (unsigned i = 0; i < offset_buffer_indices_.size(); ++i) { + for (unsigned i = 0; i < operand_offset_buffer_indices_.size(); ++i) { if (embedded_thunk_operands_[i] == std::nullopt) { new_buffers.push_back(se::DeviceMemoryBase()); continue; @@ -116,18 +162,18 @@ absl::Status AddressComputationThunk::ExecuteOnStream( se::DeviceMemoryBase orig_operand = orig_allocations.GetDeviceAddress(*embedded_thunk_operands_[i]); - if (offset_buffer_indices_[i] == std::nullopt) { + if (operand_offset_buffer_indices_[i] == std::nullopt) { new_buffers.push_back(orig_operand); continue; } se::DeviceMemoryBase offset_src = - orig_allocations.GetDeviceAddress(*offset_buffer_indices_[i]); + orig_allocations.GetDeviceAddress(*operand_offset_buffer_indices_[i]); // Copy the ith offset from device to host. - const Shape& src_shape = *orig_shapes_[i]; - const Shape& dst_shape = *sliced_shapes_[i]; - int64_t* offset_dst = &offsets_base[i]; + const Shape& src_shape = *operand_orig_shapes_[i]; + const Shape& dst_shape = *operand_sliced_shapes_[i]; + int64_t* offset_dst = &operand_offsets_base[i]; TF_RETURN_IF_ERROR(stream.Memcpy(offset_dst, offset_src, dst_shape.rank() * sizeof(int64_t))); @@ -155,15 +201,58 @@ absl::Status AddressComputationThunk::ExecuteOnStream( new_buffers.push_back(orig_operand.GetByteSlice(new_offset, new_size)); } - // TODO(vuson): handle DUS too. For now just copy the results over. - for (auto result : embedded_thunk_results_) { - if (result == std::nullopt) { + // Get memory allocation for copying result offsets from device. + int64_t* result_offsets_base = [&] { + absl::MutexLock lock(&mutex_); + return reinterpret_cast( + result_offsets_.at(stream.parent())->opaque()); + }(); + + for (unsigned i = 0; i < result_offset_buffer_indices_.size(); ++i) { + if (embedded_thunk_results_[i] == std::nullopt) { new_buffers.push_back(se::DeviceMemoryBase()); - } else { - se::DeviceMemoryBase orig_result = - orig_allocations.GetDeviceAddress(*result); + continue; + } + + se::DeviceMemoryBase orig_result = + orig_allocations.GetDeviceAddress(*embedded_thunk_results_[i]); + if (result_offset_buffer_indices_[i] == std::nullopt) { new_buffers.push_back(orig_result); + continue; + } + + se::DeviceMemoryBase offset_src = + orig_allocations.GetDeviceAddress(*result_offset_buffer_indices_[i]); + + // Copy the ith offset from device to host. + const Shape& src_shape = *result_orig_shapes_[i]; + const Shape& dst_shape = *result_sliced_shapes_[i]; + int64_t* offset_dst = &result_offsets_base[i]; + TF_RETURN_IF_ERROR(stream.Memcpy(offset_dst, offset_src, + dst_shape.rank() * sizeof(int64_t))); + + if (absl::Status blocked = stream.BlockHostUntilDone(); !blocked.ok()) { + return absl::InternalError(absl::StrFormat( + "Failed to retrieve all slice offset values on stream %p: %s", + &stream, blocked.message())); + } + + // Compute new slice. No need to copy the content to new buffers as we can + // reuse the original buffers since slices are contiguous. 
+ TF_RET_CHECK(IsContiguousSlice(src_shape, dst_shape)); + + int64_t new_size = ShapeUtil::ByteSizeOf(dst_shape); + BufferAllocation::Slice orig_slice = *embedded_thunk_results_[i]; + + int64_t new_offset = orig_slice.offset(); + std::vector slice_starts(offset_dst, + offset_dst + dst_shape.rank()); + for (auto [start, stride] : + llvm::zip(slice_starts, *ShapeUtil::ByteStrides(src_shape))) { + new_offset += start * stride; } + + new_buffers.push_back(orig_result.GetByteSlice(new_offset, new_size)); } // Safe to create a local BufferAllocations here since buffers are only slices diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h index abb6d89ed1f59c..d4bdbfe287d9b1 100644 --- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h +++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h @@ -47,9 +47,13 @@ class AddressComputationThunk : public Thunk { std::vector> operands, std::vector> results, std::vector> - offset_buffer_indices, - std::vector> orig_shapes, - std::vector> sliced_shapes); + operand_offset_buffer_indices, + std::vector> operand_orig_shapes, + std::vector> operand_sliced_shapes, + std::vector> + result_offset_buffer_indices, + std::vector> result_orig_shapes, + std::vector> result_sliced_shapes); AddressComputationThunk(const AddressComputationThunk&) = delete; AddressComputationThunk& operator=(const AddressComputationThunk&) = delete; @@ -66,16 +70,22 @@ class AddressComputationThunk : public Thunk { std::vector> embedded_thunk_results_; std::vector> - offset_buffer_indices_; - - std::vector> orig_shapes_; - std::vector> sliced_shapes_; + operand_offset_buffer_indices_; + std::vector> operand_orig_shapes_; + std::vector> operand_sliced_shapes_; + std::vector> + result_offset_buffer_indices_; + std::vector> result_orig_shapes_; + std::vector> result_sliced_shapes_; // Pinned host memory for transferring offset values from device to host. absl::Mutex mutex_; absl::flat_hash_map> - offsets_ ABSL_GUARDED_BY(mutex_); + operand_offsets_ ABSL_GUARDED_BY(mutex_); + absl::flat_hash_map> + result_offsets_ ABSL_GUARDED_BY(mutex_); }; } // namespace gpu diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc index c7a8c6b88a7653..e783cdea0ba6a3 100644 --- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc +++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc @@ -124,7 +124,9 @@ TEST(AddressComputationThunkTest, SlicedGemm) { std::make_unique(std::move(seq)), {slice_lhs, slice_rhs}, {slice_out, slice_workspace}, {slice_lhs_offset, std::nullopt}, {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), std::nullopt}, - {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt}); + {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt}, + {std::nullopt, std::nullopt}, {std::nullopt, std::nullopt}, + {std::nullopt, std::nullopt}); // Step 2: // Execute address computation thunk. 
@@ -246,7 +248,9 @@ TEST(AddressComputationThunkTest, SlicedNonContiguousGemm) { {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), ShapeUtil::MakeShape(PrimitiveType::F32, {4, 3})}, {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2}), - ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2})}); + ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2})}, + {std::nullopt, std::nullopt}, {std::nullopt, std::nullopt}, + {std::nullopt, std::nullopt}); // Step 2: // Execute address computation thunk. @@ -372,7 +376,9 @@ TEST(AddressComputationThunkTest, MulipleSlicedOperandsGemm) { {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), ShapeUtil::MakeShape(PrimitiveType::F32, {8, 1})}, {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), - ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1})}); + ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1})}, + {std::nullopt, std::nullopt}, {std::nullopt, std::nullopt}, + {std::nullopt, std::nullopt}); // Step 2: // Execute address computation thunk. @@ -520,7 +526,8 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) { {slice_offset}, {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 8})}, // Make sure to pass a dst shape with the same rank as src shape (i.e. // original slice result and not bitcasted one) - {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 8, 8})}); + {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 8, 8})}, {std::nullopt}, + {std::nullopt}, {std::nullopt}); // Step 2: // Execute address computation thunk. @@ -573,4 +580,140 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) { ASSERT_EQ(out, ref); } +TEST(AddressComputationThunkTest, SlicedOutputMemcpy) { + se::StreamExecutor* executor = GpuExecutor(); + + se::Stream stream(executor); + TF_ASSERT_OK(stream.Initialize()); + + int64_t src_count = 8 * 8 * 10 * 2; + int64_t dst_count = 2 * 2 * 2 * 2; + int64_t slice_count = 2 * 2; + int64_t src_length = sizeof(int32_t) * src_count; + int64_t dst_length = sizeof(int32_t) * dst_count; + int64_t offset_length = sizeof(int64_t) * 4; + int64_t slice_length = sizeof(int32_t) * slice_count; + + // Step 1: + // Prepare embedded and address computation thunks. + + // Preparing buffer allocation slices for thunk creations. + BufferAllocation alloc_src(/*index=*/0, src_length, /*color=*/0); + BufferAllocation::Slice slice_src(&alloc_src, 0, src_length); + + BufferAllocation alloc_dst(/*index=*/1, dst_length, /*color=*/0); + BufferAllocation::Slice slice_dst(&alloc_dst, 0, dst_length); + + BufferAllocation alloc_src_offset(/*index=*/2, offset_length, /*color=*/0); + BufferAllocation::Slice slice_src_offset(&alloc_src_offset, 0, offset_length); + + BufferAllocation alloc_dst_offset(/*index=*/3, offset_length, /*color=*/0); + BufferAllocation::Slice slice_dst_offset(&alloc_dst_offset, 0, offset_length); + + // Fake slices for embedded thunk creation. + BufferAllocation alloc_src_fake(/*index=*/0, slice_length, /*color=*/0); + BufferAllocation::Slice slice_src_fake(&alloc_src_fake, 0, slice_length); + + BufferAllocation alloc_dst_fake(/*index=*/1, slice_length, /*color=*/0); + BufferAllocation::Slice slice_dst_fake(&alloc_dst_fake, 0, slice_length); + + // Preparing custom call thunk: setting up call target and operands + results + // buffers. 
+ auto handler = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM); + ASSERT_TRUE(handler.ok()); + + std::vector> operands{ + CustomCallThunk::Slice{slice_src_fake, + ShapeUtil::MakeShape(PrimitiveType::S32, {2, 2})}}; + std::vector> results{ + CustomCallThunk::Slice{slice_dst_fake, + ShapeUtil::MakeShape(PrimitiveType::S32, {2, 2})}}; + + // Creating embedded custom call thunk. + ThunkSequence seq; + seq.emplace_back(std::make_unique( + Thunk::ThunkInfo(nullptr), *handler, operands, results, + /*attributes=*/CustomCallThunk::AttributesMap(), + /*called_computation=*/nullptr)); + + // Wrapping address computation thunk around the custom call thunk. + AddressComputationThunk thunk( + Thunk::ThunkInfo(nullptr), + std::make_unique(std::move(seq)), {slice_src}, {slice_dst}, + {slice_src_offset}, + {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 2})}, + // Make sure to pass a dst shape with the same rank as src shape (i.e. + // original slice result and not bitcasted one) + {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2})}, + {slice_dst_offset}, + {{ShapeUtil::MakeShape(PrimitiveType::S32, {2, 2, 2, 2})}}, + {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2})}); + + // Step 2: + // Execute address computation thunk. + // + // Given a `src` tensor of shape s32[8,8,10,2]{3,2,1,0} + // The `src` slice that we want to copy from will be equivalent to this static + // slice op: + // s32[1,1,2,2]{3,2,1,0} slice(src), slice={[3:4], [5:6], [2:4], [0:2]} + // + // Given a `dst` tensor of shape s32[2,2,2,2]{3,2,1,0} + // The `dst` slice that we want to copy into will be equivalent to this static + // slice op: + // s32[1,1,2,2]{3,2,1,0} slice(dst), slice={[1:2], [1:2], [0:2], [0:2]} + + // Preparing memory for thunk arguments. + se::DeviceMemory src = executor->AllocateArray(src_count); + std::vector src_arr(src_count, 0); + for (unsigned i = 0; i < src_count; ++i) src_arr[i] = i; + TF_ASSERT_OK(stream.Memcpy(&src, src_arr.data(), src_length)); + + se::DeviceMemory dst = executor->AllocateArray(dst_count); + TF_ASSERT_OK(stream.MemZero(&dst, dst_length)); + + se::DeviceMemory src_offset = executor->AllocateArray(4); + std::vector src_offset_arr{3, 5, 2, 0}; + TF_ASSERT_OK( + stream.Memcpy(&src_offset, src_offset_arr.data(), offset_length)); + + se::DeviceMemory dst_offset = executor->AllocateArray(4); + std::vector dst_offset_arr{1, 1, 0, 0}; + TF_ASSERT_OK( + stream.Memcpy(&dst_offset, dst_offset_arr.data(), offset_length)); + + // Preparing parameters for thunk execution. + ServiceExecutableRunOptions run_options; + BufferAllocations allocations({src, dst, src_offset, dst_offset}, 0, + executor->GetAllocator()); + + Thunk::ExecuteParams params = Thunk::ExecuteParams::Create( + run_options, allocations, &stream, &stream, {}, nullptr, nullptr); + + Thunk::ExecutableSource source = {/*text=*/"", /*binary=*/{}}; + TF_ASSERT_OK( + thunk.Initialize({executor, source, &allocations, &stream, &stream})); + + // Executing address computation thunk. + TF_ASSERT_OK(thunk.ExecuteOnStream(params)); + TF_ASSERT_OK(stream.BlockHostUntilDone()); + + // Copying `dst` data back to host for verification. + std::vector out(dst_count, 0); + TF_ASSERT_OK(stream.Memcpy(out.data(), dst, dst_length)); + + // Verifying that the right slice of `src` was copied to `dst`. 
+ std::vector ref(dst_count, 0); + int64_t src_offset_val = + src_offset_arr[3] + + 2 * (src_offset_arr[2] + + 10 * (src_offset_arr[1] + 8 * src_offset_arr[0])); + int64_t dst_offset_val = + dst_offset_arr[3] + + 2 * (dst_offset_arr[2] + 2 * (dst_offset_arr[1] + 2 * dst_offset_arr[0])); + std::copy(src_arr.begin() + src_offset_val, + src_arr.begin() + src_offset_val + slice_count, + ref.begin() + dst_offset_val); + ASSERT_EQ(out, ref); +} + } // namespace xla::gpu From de598b814a737e17292c99fe8036c33d2f170141 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 17 Mar 2024 02:02:06 -0700 Subject: [PATCH 018/670] compat: Update forward compatibility horizon to 2024-03-17 PiperOrigin-RevId: 616552576 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 5a949daa30884e..382dfdf2eb7712 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -29,7 +29,7 @@ # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2024, 3, 16) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2024, 3, 17) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 042a7c1bbcc7636ee7ea6d9469a061296c1ddf97 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 17 Mar 2024 02:02:13 -0700 Subject: [PATCH 019/670] Update GraphDef version to 1804. PiperOrigin-RevId: 616552596 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 8fa4e8122aab1e..c96dd2e0380234 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 1803 // Updated: 2024/3/16 +#define TF_GRAPH_DEF_VERSION 1804 // Updated: 2024/3/17 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 229b345cbe5afd07c55c51a5a5325ce9016200d5 Mon Sep 17 00:00:00 2001 From: "Jae H. 
Yoo" Date: Sun, 17 Mar 2024 17:13:00 -0700 Subject: [PATCH 020/670] Add BFLOAT16 to TFLite flatbuffer schema PiperOrigin-RevId: 616668681 --- .../compiler/mlir/lite/utils/convert_type.cc | 8 ++++++++ tensorflow/lite/core/api/BUILD | 3 ++- .../lite/core/api/flatbuffer_conversions.cc | 3 +++ .../lite/core/api/flatbuffer_conversions_test.cc | 8 ++++++++ tensorflow/lite/core/c/c_api_types.h | 1 + tensorflow/lite/core/c/common.cc | 2 ++ tensorflow/lite/core/c/common.h | 7 +++++++ tensorflow/lite/core/c/common_test.cc | 1 + tensorflow/lite/core/tools/verifier.cc | 3 +++ tensorflow/lite/delegates/flex/BUILD | 3 ++- tensorflow/lite/delegates/flex/util.cc | 8 ++++++++ tensorflow/lite/delegates/flex/util_test.cc | 3 +++ .../delegates/gpu/common/model_builder_helper.h | 2 ++ tensorflow/lite/objc/apis/TFLTensor.h | 3 +++ tensorflow/lite/objc/sources/TFLCommonUtil.mm | 2 ++ tensorflow/lite/optional_debug_tools.cc | 2 ++ tensorflow/lite/python/interpreter_wrapper/BUILD | 1 + .../lite/python/interpreter_wrapper/numpy.cc | 8 ++++++-- .../lite/python/optimize/calibration_wrapper.cc | 2 ++ tensorflow/lite/schema/schema.fbs | 1 + tensorflow/lite/schema/schema_generated.h | 15 +++++++++------ .../lite/tools/serialization/enum_mapping.h | 2 ++ tensorflow/lite/util.cc | 3 +++ 23 files changed, 81 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/utils/convert_type.cc b/tensorflow/compiler/mlir/lite/utils/convert_type.cc index 9b215e77b89529..e09030ceb7515f 100644 --- a/tensorflow/compiler/mlir/lite/utils/convert_type.cc +++ b/tensorflow/compiler/mlir/lite/utils/convert_type.cc @@ -34,6 +34,8 @@ namespace errors = tensorflow::errors; tflite::TensorType ConvertTypeToTensorType(mlir::Type type) { if (type.isF16()) { return tflite::TensorType_FLOAT16; + } else if (type.isBF16()) { + return tflite::TensorType_BFLOAT16; } else if (type.isF32()) { return tflite::TensorType_FLOAT32; } else if (type.isF64()) { @@ -81,6 +83,8 @@ mlir::Type ConvertElementType(tflite::TensorType type, mlir::Builder builder) { switch (type) { case tflite::TensorType_FLOAT16: return builder.getF16Type(); + case tflite::TensorType_BFLOAT16: + return builder.getBF16Type(); case tflite::TensorType_FLOAT32: return builder.getF32Type(); case tflite::TensorType_FLOAT64: @@ -128,6 +132,8 @@ tensorflow::DataType TflTypeToTfType(tflite::TensorType type) { return tensorflow::DT_COMPLEX128; case tflite::TensorType_FLOAT16: return tensorflow::DT_HALF; + case tflite::TensorType_BFLOAT16: + return tensorflow::DT_BFLOAT16; case tflite::TensorType_FLOAT32: return tensorflow::DT_FLOAT; case tflite::TensorType_FLOAT64: @@ -170,6 +176,8 @@ absl::StatusOr TfTypeToTflType(tensorflow::DataType type) { return tflite::TensorType_COMPLEX128; case tensorflow::DT_HALF: return tflite::TensorType_FLOAT16; + case tensorflow::DT_BFLOAT16: + return tflite::TensorType_BFLOAT16; case tensorflow::DT_FLOAT: return tflite::TensorType_FLOAT32; case tensorflow::DT_DOUBLE: diff --git a/tensorflow/lite/core/api/BUILD b/tensorflow/lite/core/api/BUILD index a0e28f1ccaaf8b..1d6e1ca1eed47a 100644 --- a/tensorflow/lite/core/api/BUILD +++ b/tensorflow/lite/core/api/BUILD @@ -1,6 +1,6 @@ +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") load("//tensorflow/lite:build_def.bzl", "tflite_copts") load("//tensorflow/lite:special_rules.bzl", "op_resolver_internal_visibility_allowlist") -load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], 
@@ -154,6 +154,7 @@ cc_test( ":api", "//tensorflow/lite:string", "//tensorflow/lite/c:c_api_types", + "//tensorflow/lite/core/c:c_api_types", "//tensorflow/lite/core/c:common", "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest_main", diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc index 10feeb3fc2c7dd..d36c2b69f4058a 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc @@ -1017,6 +1017,9 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type, case TensorType_FLOAT16: *type = kTfLiteFloat16; return kTfLiteOk; + case TensorType_BFLOAT16: + *type = kTfLiteBFloat16; + return kTfLiteOk; case TensorType_FLOAT32: *type = kTfLiteFloat32; return kTfLiteOk; diff --git a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc index 6e08e6880e5522..87c897dfc0928e 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc @@ -28,6 +28,7 @@ limitations under the License. #include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/c/builtin_op_data.h" +#include "tensorflow/lite/core/c/c_api_types.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/string_type.h" @@ -189,6 +190,13 @@ TEST_F(FlatbufferConversionsTest, TestConvertTensorTypeFloat16) { EXPECT_EQ(kTfLiteFloat16, type); } +TEST_F(FlatbufferConversionsTest, TestConvertTensorTypeBFloat16) { + TfLiteType type; + EXPECT_EQ(kTfLiteOk, + ConvertTensorType(TensorType_BFLOAT16, &type, &mock_reporter_)); + EXPECT_EQ(kTfLiteBFloat16, type); +} + TEST_F(FlatbufferConversionsTest, TestConvertTensorTypeInt4) { TfLiteType type; EXPECT_EQ(kTfLiteOk, diff --git a/tensorflow/lite/core/c/c_api_types.h b/tensorflow/lite/core/c/c_api_types.h index 1170025cbab9a2..32cefa839f4452 100644 --- a/tensorflow/lite/core/c/c_api_types.h +++ b/tensorflow/lite/core/c/c_api_types.h @@ -133,6 +133,7 @@ typedef enum { kTfLiteUInt32 = 16, kTfLiteUInt16 = 17, kTfLiteInt4 = 18, + kTfLiteBFloat16 = 19, } TfLiteType; /// Legacy. Will be deprecated in favor of `TfLiteAffineQuantization`. diff --git a/tensorflow/lite/core/c/common.cc b/tensorflow/lite/core/c/common.cc index fd7c415f96e634..7afecdbe885199 100644 --- a/tensorflow/lite/core/c/common.cc +++ b/tensorflow/lite/core/c/common.cc @@ -370,6 +370,8 @@ const char* TfLiteTypeGetName(TfLiteType type) { return "STRING"; case kTfLiteFloat16: return "FLOAT16"; + case kTfLiteBFloat16: + return "BFLOAT16"; case kTfLiteFloat64: return "FLOAT64"; case kTfLiteResource: diff --git a/tensorflow/lite/core/c/common.h b/tensorflow/lite/core/c/common.h index 4e4890164d3aa6..9801bde9ddc6ea 100644 --- a/tensorflow/lite/core/c/common.h +++ b/tensorflow/lite/core/c/common.h @@ -296,6 +296,13 @@ typedef struct TfLiteFloat16 { uint16_t data; } TfLiteFloat16; +/// bfloat16 data type compatible with the Google Brain definition. +/// https://cloud.google.com/tpu/docs/bfloat16. +/// This provides 1 bit of sign, 8 bits of exponent, and 7 bits of mantissa. +typedef struct TfLiteBFloat16 { + uint16_t data; +} TfLiteBFloat16; + /// Return the name of a given type, for error reporting purposes. 
const char* TfLiteTypeGetName(TfLiteType type); diff --git a/tensorflow/lite/core/c/common_test.cc b/tensorflow/lite/core/c/common_test.cc index d2bc137378656e..58fd8654d8b171 100644 --- a/tensorflow/lite/core/c/common_test.cc +++ b/tensorflow/lite/core/c/common_test.cc @@ -107,6 +107,7 @@ TEST(Types, TestTypeNames) { EXPECT_EQ(type_name(kTfLiteFloat64), "FLOAT64"); EXPECT_EQ(type_name(kTfLiteFloat32), "FLOAT32"); EXPECT_EQ(type_name(kTfLiteFloat16), "FLOAT16"); + EXPECT_EQ(type_name(kTfLiteBFloat16), "BFLOAT16"); EXPECT_EQ(type_name(kTfLiteInt16), "INT16"); EXPECT_EQ(type_name(kTfLiteUInt16), "UINT16"); EXPECT_EQ(type_name(kTfLiteInt32), "INT32"); diff --git a/tensorflow/lite/core/tools/verifier.cc b/tensorflow/lite/core/tools/verifier.cc index cdf8959d55483f..c878f7c392d14a 100644 --- a/tensorflow/lite/core/tools/verifier.cc +++ b/tensorflow/lite/core/tools/verifier.cc @@ -409,6 +409,9 @@ bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer, case TensorType_FLOAT16: bytes_required *= sizeof(uint16_t); break; + case TensorType_BFLOAT16: + bytes_required *= sizeof(uint16_t); + break; case TensorType_FLOAT64: bytes_required *= sizeof(double); break; diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index 77a16ffa032865..5f126f68124cf8 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -7,10 +7,10 @@ load( "tf_opts_nortti_if_lite_protos", "tf_opts_nortti_if_mobile", ) +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") load("//tensorflow/lite:build_def.bzl", "tflite_copts") load("//tensorflow/lite:special_rules.bzl", "internal_visibility_allowlist") load("//tensorflow/lite/delegates/flex:build_def.bzl", "tflite_flex_cc_library", "tflite_flex_shared_library") -load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") default_visibility = [ "//tensorflow/compiler/mlir/lite:__subpackages__", @@ -322,6 +322,7 @@ cc_library( "//tensorflow/lite:kernel_api", "//tensorflow/lite:string_util", "//tensorflow/lite:util", + "//tensorflow/lite/core/c:c_api_types", "//tensorflow/lite/core/c:common", "//tensorflow/lite/kernels/internal:tensor", "@com_google_absl//absl/strings:str_format", diff --git a/tensorflow/lite/delegates/flex/util.cc b/tensorflow/lite/delegates/flex/util.cc index 8a115a4f33cf64..9940fadb8d7625 100644 --- a/tensorflow/lite/delegates/flex/util.cc +++ b/tensorflow/lite/delegates/flex/util.cc @@ -17,10 +17,12 @@ limitations under the License. 
#include #include "absl/strings/str_format.h" +#include "tensorflow/c/tf_datatype.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/statusor.h" #include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/lite/core/c/c_api_types.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/string_util.h" @@ -74,6 +76,8 @@ TF_DataType GetTensorFlowDataType(TfLiteType type) { return TF_FLOAT; case kTfLiteFloat16: return TF_HALF; + case kTfLiteBFloat16: + return TF_BFLOAT16; case kTfLiteFloat64: return TF_DOUBLE; case kTfLiteInt16: @@ -116,6 +120,8 @@ TfLiteType GetTensorFlowLiteType(TF_DataType type) { return kTfLiteFloat32; case TF_HALF: return kTfLiteFloat16; + case TF_BFLOAT16: + return kTfLiteBFloat16; case TF_DOUBLE: return kTfLiteFloat64; case TF_INT16: @@ -186,6 +192,8 @@ const char* TfLiteTypeToTfTypeName(TfLiteType type) { return "string"; case kTfLiteFloat16: return "float16"; + case kTfLiteBFloat16: + return "bfloat16"; case kTfLiteFloat64: return "float64"; case kTfLiteResource: diff --git a/tensorflow/lite/delegates/flex/util_test.cc b/tensorflow/lite/delegates/flex/util_test.cc index c7361314aa38f5..7dfea9e6437c9d 100644 --- a/tensorflow/lite/delegates/flex/util_test.cc +++ b/tensorflow/lite/delegates/flex/util_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "tensorflow/c/tf_datatype.h" #include "tensorflow/core/framework/resource_handle.h" #include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/lite/core/c/c_api_types.h" @@ -118,6 +119,7 @@ TEST(UtilTest, TypeConversionsFromTFLite) { EXPECT_EQ(TF_FLOAT, GetTensorFlowDataType(kTfLiteNoType)); EXPECT_EQ(TF_FLOAT, GetTensorFlowDataType(kTfLiteFloat32)); EXPECT_EQ(TF_HALF, GetTensorFlowDataType(kTfLiteFloat16)); + EXPECT_EQ(TF_BFLOAT16, GetTensorFlowDataType(kTfLiteBFloat16)); EXPECT_EQ(TF_DOUBLE, GetTensorFlowDataType(kTfLiteFloat64)); EXPECT_EQ(TF_INT16, GetTensorFlowDataType(kTfLiteInt16)); EXPECT_EQ(TF_INT32, GetTensorFlowDataType(kTfLiteInt32)); @@ -136,6 +138,7 @@ TEST(UtilTest, TypeConversionsFromTFLite) { TEST(UtilTest, TypeConversionsFromTensorFlow) { EXPECT_EQ(kTfLiteFloat16, GetTensorFlowLiteType(TF_HALF)); + EXPECT_EQ(kTfLiteBFloat16, GetTensorFlowLiteType(TF_BFLOAT16)); EXPECT_EQ(kTfLiteFloat32, GetTensorFlowLiteType(TF_FLOAT)); EXPECT_EQ(kTfLiteFloat64, GetTensorFlowLiteType(TF_DOUBLE)); EXPECT_EQ(kTfLiteInt16, GetTensorFlowLiteType(TF_INT16)); diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.h b/tensorflow/lite/delegates/gpu/common/model_builder_helper.h index 14384ce5be9a1c..27bb621c40dea9 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.h +++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.h @@ -154,6 +154,8 @@ absl::Status CreateVectorCopyData(const TfLiteTensor& src, T* dst) { return absl::OkStatus(); case kTfLiteFloat16: return absl::UnimplementedError("src can't be float16."); + case kTfLiteBFloat16: + return absl::UnimplementedError("src can't be bfloat16."); case kTfLiteFloat64: for (int i = 0; i < n; ++i) { dst[i] = tflite::GetTensorData(&src)[i]; diff --git a/tensorflow/lite/objc/apis/TFLTensor.h b/tensorflow/lite/objc/apis/TFLTensor.h index cd60b2144a0e6c..deaf52f9e5843f 100644 --- a/tensorflow/lite/objc/apis/TFLTensor.h +++ b/tensorflow/lite/objc/apis/TFLTensor.h @@ -52,6 +52,9 @@ typedef NS_ENUM(NSUInteger, TFLTensorDataType) { /** 64-bit double precision floating point. 
*/ TFLTensorDataTypeFloat64, + + /** 16-bit bfloat16 floating point. */ + TFLTensorDataTypeBFloat16, }; /** diff --git a/tensorflow/lite/objc/sources/TFLCommonUtil.mm b/tensorflow/lite/objc/sources/TFLCommonUtil.mm index 57362ceabb6597..8f9e37ebb421b6 100644 --- a/tensorflow/lite/objc/sources/TFLCommonUtil.mm +++ b/tensorflow/lite/objc/sources/TFLCommonUtil.mm @@ -32,6 +32,8 @@ TFLTensorDataType TFLTensorDataTypeFromCTensor(const TfLiteTensor *cTensor) { return TFLTensorDataTypeFloat32; case kTfLiteFloat16: return TFLTensorDataTypeFloat16; + case kTfLiteBFloat16: + return TFLTensorDataTypeBFloat16; case kTfLiteFloat64: return TFLTensorDataTypeFloat64; case kTfLiteInt32: diff --git a/tensorflow/lite/optional_debug_tools.cc b/tensorflow/lite/optional_debug_tools.cc index ce6e9e4973f702..9b716cdffb17c9 100644 --- a/tensorflow/lite/optional_debug_tools.cc +++ b/tensorflow/lite/optional_debug_tools.cc @@ -336,6 +336,8 @@ const char* TensorTypeName(TfLiteType type) { return "kTfLiteComplex128"; case kTfLiteFloat16: return "kTfLiteFloat16"; + case kTfLiteBFloat16: + return "kTfLiteBFloat16"; case kTfLiteFloat64: return "kTfLiteFloat64"; case kTfLiteResource: diff --git a/tensorflow/lite/python/interpreter_wrapper/BUILD b/tensorflow/lite/python/interpreter_wrapper/BUILD index fa0af673063325..ed111e41efee0a 100644 --- a/tensorflow/lite/python/interpreter_wrapper/BUILD +++ b/tensorflow/lite/python/interpreter_wrapper/BUILD @@ -13,6 +13,7 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ "//tensorflow/lite:string_util", + "//tensorflow/lite/core/c:c_api_types", "//tensorflow/lite/core/c:common", "//third_party/py/numpy:headers", "//third_party/python_runtime:headers", # buildcleaner: keep diff --git a/tensorflow/lite/python/interpreter_wrapper/numpy.cc b/tensorflow/lite/python/interpreter_wrapper/numpy.cc index 0e07563702fcb0..45146cf88b0616 100644 --- a/tensorflow/lite/python/interpreter_wrapper/numpy.cc +++ b/tensorflow/lite/python/interpreter_wrapper/numpy.cc @@ -13,11 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #define TFLITE_IMPORT_NUMPY // See numpy.h for explanation. 
+#include "tensorflow/lite/core/c/c_api_types.h" #include "tensorflow/lite/python/interpreter_wrapper/numpy.h" -#include - namespace tflite { namespace python { @@ -38,6 +39,9 @@ int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) { return NPY_FLOAT32; case kTfLiteFloat16: return NPY_FLOAT16; + case kTfLiteBFloat16: + // TODO(b/329491949): NPY_BFLOAT16 currently doesn't exist + return NPY_FLOAT16; case kTfLiteFloat64: return NPY_FLOAT64; case kTfLiteInt32: diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc index ffccf71a40635e..65f5dfe49d51ca 100644 --- a/tensorflow/lite/python/optimize/calibration_wrapper.cc +++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc @@ -114,6 +114,8 @@ inline TensorType TfLiteTypeToSchemaType(TfLiteType type) { return TensorType_FLOAT32; case kTfLiteFloat16: return TensorType_FLOAT16; + case kTfLiteBFloat16: + return TensorType_BFLOAT16; case kTfLiteFloat64: return TensorType_FLOAT64; case kTfLiteInt32: diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs index 382462f938d93b..fe9ee4c11cc5c9 100644 --- a/tensorflow/lite/schema/schema.fbs +++ b/tensorflow/lite/schema/schema.fbs @@ -58,6 +58,7 @@ enum TensorType : byte { UINT32 = 15, UINT16 = 16, INT4 = 17, + BFLOAT16 = 18, } // Custom quantization parameters for experimenting with new quantization diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h index b416555e837c3f..79d78c1fc84341 100755 --- a/tensorflow/lite/schema/schema_generated.h +++ b/tensorflow/lite/schema/schema_generated.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -700,11 +700,12 @@ enum TensorType : int8_t { TensorType_UINT32 = 15, TensorType_UINT16 = 16, TensorType_INT4 = 17, + TensorType_BFLOAT16 = 18, TensorType_MIN = TensorType_FLOAT32, - TensorType_MAX = TensorType_INT4 + TensorType_MAX = TensorType_BFLOAT16 }; -inline const TensorType (&EnumValuesTensorType())[18] { +inline const TensorType (&EnumValuesTensorType())[19] { static const TensorType values[] = { TensorType_FLOAT32, TensorType_FLOAT16, @@ -723,13 +724,14 @@ inline const TensorType (&EnumValuesTensorType())[18] { TensorType_VARIANT, TensorType_UINT32, TensorType_UINT16, - TensorType_INT4 + TensorType_INT4, + TensorType_BFLOAT16 }; return values; } inline const char * const *EnumNamesTensorType() { - static const char * const names[19] = { + static const char * const names[20] = { "FLOAT32", "FLOAT16", "INT32", @@ -748,13 +750,14 @@ inline const char * const *EnumNamesTensorType() { "UINT32", "UINT16", "INT4", + "BFLOAT16", nullptr }; return names; } inline const char *EnumNameTensorType(TensorType e) { - if (::flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_INT4)) return ""; + if (::flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_BFLOAT16)) return ""; const size_t index = static_cast(e); return EnumNamesTensorType()[index]; } diff --git a/tensorflow/lite/tools/serialization/enum_mapping.h b/tensorflow/lite/tools/serialization/enum_mapping.h index 574b1ee3e21cf7..d218b66258581f 100644 --- a/tensorflow/lite/tools/serialization/enum_mapping.h +++ b/tensorflow/lite/tools/serialization/enum_mapping.h @@ -64,6 +64,8 @@ inline TensorType TfLiteTypeToSchemaType(TfLiteType type) { return TensorType_FLOAT32; case kTfLiteFloat16: return TensorType_FLOAT16; + case kTfLiteBFloat16: + return TensorType_BFLOAT16; case kTfLiteFloat64: return TensorType_FLOAT64; case kTfLiteInt32: diff --git a/tensorflow/lite/util.cc b/tensorflow/lite/util.cc index d0d385a310d732..cecda0e5eb44a1 100644 --- a/tensorflow/lite/util.cc +++ b/tensorflow/lite/util.cc @@ -118,6 +118,9 @@ TfLiteStatus GetSizeOfType(TfLiteContext* context, const TfLiteType type, case kTfLiteFloat16: *bytes = sizeof(TfLiteFloat16); break; + case kTfLiteBFloat16: + *bytes = sizeof(TfLiteBFloat16); + break; case kTfLiteFloat64: *bytes = sizeof(double); break; From eac0721765d3f5ee31a9a36baff8ce70352012f4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 Mar 2024 02:01:57 -0700 Subject: [PATCH 021/670] compat: Update forward compatibility horizon to 2024-03-18 PiperOrigin-RevId: 616753201 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 382dfdf2eb7712..813819ae0aec8d 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -29,7 +29,7 @@ # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2024, 3, 17) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2024, 3, 18) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 1449a4f07665788459a3cb37fbe4354835e592ef Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 Mar 2024 02:02:10 -0700 Subject: [PATCH 022/670] Update GraphDef version to 1805. 
PiperOrigin-RevId: 616753252 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index c96dd2e0380234..b199c37ee80142 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 1804 // Updated: 2024/3/17 +#define TF_GRAPH_DEF_VERSION 1805 // Updated: 2024/3/18 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 4c0445cde011df70ea111c09d2fa6a9f412b8f93 Mon Sep 17 00:00:00 2001 From: Johannes Reifferscheid Date: Mon, 18 Mar 2024 07:05:12 -0700 Subject: [PATCH 023/670] Fix MOF transpose fusions. The current code attempts to evaluate the epilogue for each transpose, but it needs to be evaluated once for all transposes together. PiperOrigin-RevId: 616815857 --- .../xla/service/gpu/fusions/transpose_mlir.cc | 202 +++++------------- .../xla/service/gpu/fusions/transpose_mlir.h | 15 +- .../service/gpu/model/indexing_analysis.cc | 4 + 3 files changed, 67 insertions(+), 154 deletions(-) diff --git a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc index 9311097e0093f8..8f3f4ef37480b4 100644 --- a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc +++ b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc @@ -111,17 +111,16 @@ Tiling ComputeTransposeTiling(const TransposeDescription& tiled_transpose) { } // Returns transpose heroes that should be codegened via shmem. -absl::flat_hash_set GetShMemTranposes( +std::vector GetShMemTransposes( const HloFusionAnalysis& analysis) { - absl::flat_hash_set tranposes_to_tile; + ConstHloInstructionSet transposes_to_tile; for (const auto [hero, root] : llvm::zip(analysis.fusion_heroes(), analysis.fusion_roots())) { - if (!GetDescriptionForTiledTransposeEmitter(*root, *hero)) { - continue; + if (GetDescriptionForTiledTransposeEmitter(*root, *hero)) { + transposes_to_tile.insert(hero); } - tranposes_to_tile.insert(hero); } - return tranposes_to_tile; + return {transposes_to_tile.begin(), transposes_to_tile.end()}; } } // namespace @@ -129,7 +128,7 @@ absl::flat_hash_set GetShMemTranposes( MlirTransposeFusion::MlirTransposeFusion(const HloFusionAnalysis& analysis) : analysis_(analysis), tiling_(ComputeTransposeTiling(analysis.tiled_transpose())), - shmem_transposes_(GetShMemTranposes(analysis)) { + shmem_transposes_(GetShMemTransposes(analysis)) { for (auto [root, hero] : llvm::zip(analysis_.fusion_roots(), analysis_.fusion_heroes())) { if (auto transpose = GetDescriptionForTiledTransposeEmitter(*root, *hero)) { @@ -143,14 +142,7 @@ MlirTransposeFusion::MlirTransposeFusion(const HloFusionAnalysis& analysis) const HloFusionAnalysis& analysis) { // If there is a hero, which does not have a transpose, the codegen might // fail because of the incorrect thread ID mapping for that particular case. 
- for (const auto [hero, root] : - llvm::zip(analysis.fusion_heroes(), analysis.fusion_roots())) { - if (!GetDescriptionForTiledTransposeEmitter(*root, *hero)) { - return false; - } - } - return mlir_converter::IsHloConversionSupported( - analysis.fusion(), analysis.device_info().gpu_compute_capability()); + return GetShMemTransposes(analysis).size() == analysis.fusion_heroes().size(); } std::optional MlirTransposeFusion::ComputeThreadIdToOutputIndexing( @@ -161,7 +153,11 @@ std::optional MlirTransposeFusion::ComputeThreadIdToOutputIndexing( // Non-transpose roots are elementwise by definition. return ComputeThreadIdToInputIndexing(root_index, 0, ctx); } + return ComputeThreadIdToOutputIndexing(hero, ctx); +} +IndexingMap MlirTransposeFusion::ComputeThreadIdToOutputIndexing( + const HloInstruction& hero, MLIRContext* ctx) const { // The block offsets are permuted, but the thread offsets remain the same. auto block_offset = GetBlockOffsetsForTiling(tiling_, ctx) .getSubMap(std::vector{permutation_.begin(), @@ -180,10 +176,8 @@ std::optional MlirTransposeFusion::ComputeThreadIdToOutputIndexing( return map; } -std::optional MlirTransposeFusion::ComputeThreadIdToInputIndexing( - int64_t root_index, int64_t hero_operand_index, MLIRContext* ctx) const { - const auto& hero = *analysis_.fusion_heroes()[root_index]; - +IndexingMap MlirTransposeFusion::ComputeThreadIdToInputIndexing( + const HloInstruction& hero, MLIRContext* ctx) const { auto map = ComposeIndexingMaps( GetIndexingMapForTiling(tiling_, ctx), GetBitcastMap(tiling_.GetXlaShape(), hero.operand(0)->shape(), ctx)); @@ -242,32 +236,12 @@ absl::StatusOr> MlirTransposeFusion::EmitWriteToShMemMlir( int num_inputs = fusion.fused_instructions_computation()->num_parameters(); int num_outputs = entry_function.getArguments().size() - num_inputs; - SmallPtrSet emitted_heros; - SmallVector shmem_intermediate_result; - for (const auto& [root_index, hero_and_root] : llvm::enumerate( - llvm::zip(analysis_.fusion_heroes(), analysis_.fusion_roots()))) { - const HloInstruction* transpose = std::get<0>(hero_and_root); - const HloInstruction* root = std::get<1>(hero_and_root); - - // The same hero can occure only multiple (hero, root) pair. We should emit - // the write to shmem only once. - if (!emitted_heros.insert(transpose).second) { - continue; - } - - // Skip non-transpose heroes and handle them in EmitReadFromShMemMlir. - auto description = - GetDescriptionForTiledTransposeEmitter(*root, *transpose); - if (!description.has_value()) { - continue; - } - - auto input_indexing = ComputeThreadIdToInputIndexing( - root_index, /*hero_operand_index=*/0, builder.getContext()); - TF_RET_CHECK(input_indexing) << "Indexing is never nullopt"; + for (auto* transpose : shmem_transposes_) { + auto input_indexing = + ComputeThreadIdToInputIndexing(*transpose, builder.getContext()); IndexingMap shmem_input_indexing = - GetSharedMemoryWriteIndexingMap(*input_indexing, permutation_[2]); + GetSharedMemoryWriteIndexingMap(input_indexing, permutation_[2]); // Allocate shared memory. const HloInstruction* transpose_operand = transpose->operand(0); @@ -278,11 +252,11 @@ absl::StatusOr> MlirTransposeFusion::EmitWriteToShMemMlir( // Emit loop that writes subgraphs of transpose operands to shmem. 
auto shmem_result = EmitThreadLoopNest( - builder, {shmem}, *input_indexing, + builder, {shmem}, input_indexing, [&](ValueRange output_tensors, ValueRange dim_values, ValueRange symbol_values) -> SmallVector { auto input_indices = - ApplyAffineMap(input_indexing->GetAffineMap(), dim_values, + ApplyAffineMap(input_indexing.GetAffineMap(), dim_values, symbol_values, builder); auto shmem_indices = ApplyAffineMap(shmem_input_indexing.GetAffineMap(), dim_values, @@ -313,115 +287,43 @@ absl::Status MlirTransposeFusion::EmitReadFromShMemMlir( const HloFusionInstruction& fusion, const mlir_converter::PartitionedComputations& computations, const CallTargetProvider& call_targets, ValueRange shmem_tensors) const { - SmallVector result_tensors; - int num_inputs = fusion.fused_instructions_computation()->num_parameters(); - SmallPtrSet hero_roots{ - analysis_.fusion_roots().begin(), analysis_.fusion_roots().end()}; - - // Cache for root indexing per hero. If multiple roots use the same hero, they - // will have identical indexing. - absl::flat_hash_map root_to_hero_indexing; - - int transpose_hero_count = 0; - - // Map from hero instruction to shmem tensor value. - absl::flat_hash_map hero_to_shmem_tensor; - ValueRange output_tensor_args = entry_function.getArguments().drop_front(num_inputs); + auto output_indexing = ComputeThreadIdToOutputIndexing( + *shmem_transposes_.front(), builder.getContext()); + auto shmem_output_indexing = + GetSharedMemoryReadIndexingMap(output_indexing, permutation_[2]); + auto epilogue_indexing = ComputeEpilogueInputToOutputIndexing( + shmem_transposes_.front(), builder.getContext()); + auto root_indexing = ComposeIndexingMaps(output_indexing, epilogue_indexing); + auto result_tensors = EmitThreadLoopNest( + builder, output_tensor_args, output_indexing, + [&](ValueRange output_tensors, ValueRange dim_values, + ValueRange symbol_values) -> SmallVector { + auto shmem_indices = + ApplyAffineMap(shmem_output_indexing.GetAffineMap(), dim_values, + symbol_values, builder); + llvm::SmallVector transpose_values; + for (auto shmem : shmem_tensors) { + transpose_values.push_back( + builder.create(shmem, shmem_indices)); + } + auto root_indices = ApplyAffineMap(root_indexing.GetAffineMap(), + dim_values, symbol_values, builder); + auto result_scalars = + EmitEpilogue(computations, entry_function, transpose_values, + root_indices, builder); + SmallVector results; + results.reserve(output_tensor_args.size()); + for (auto [tensor, value] : llvm::zip(output_tensors, result_scalars)) { + results.push_back( + builder.create(value, tensor, root_indices)); + } + return results; + }); - for (const auto& [root_index, hero_and_root] : llvm::enumerate( - llvm::zip(analysis_.fusion_heroes(), analysis_.fusion_roots()))) { - const HloInstruction* transpose = std::get<0>(hero_and_root); - const HloInstruction* root = std::get<1>(hero_and_root); - - auto* mlir_context = builder.getContext(); - auto output_indexing = - ComputeThreadIdToOutputIndexing(root_index, mlir_context); - TF_RET_CHECK(output_indexing) << "Indexing is never nullopt"; - - if (!root_to_hero_indexing.contains(transpose)) { - auto epilogue_indexing = ComputeEpilogueInputToOutputIndexing( - transpose, mlir_context, - /*is_root=*/[&](const HloInstruction* instr) { - return hero_roots.contains(instr); - }); - root_to_hero_indexing.emplace( - transpose, ComposeIndexingMaps(*output_indexing, epilogue_indexing)); - } - - const IndexingMap& root_indexing = root_to_hero_indexing.at(transpose); - - IndexingMap shmem_output_indexing = - 
GetSharedMemoryReadIndexingMap(*output_indexing, permutation_[2]); - auto description = - GetDescriptionForTiledTransposeEmitter(*root, *transpose); - - if (description.has_value()) { - auto subresult_tensors = EmitThreadLoopNest( - builder, output_tensor_args[root_index], *output_indexing, - [&](ValueRange output_tensors, ValueRange dim_values, - ValueRange symbol_values) -> SmallVector { - auto root_indices = - ApplyAffineMap(root_indexing.GetAffineMap(), dim_values, - symbol_values, builder); - auto shmem_indices = - ApplyAffineMap(shmem_output_indexing.GetAffineMap(), dim_values, - symbol_values, builder); - - if (!hero_to_shmem_tensor.contains(transpose)) { - hero_to_shmem_tensor[transpose] = - shmem_tensors[transpose_hero_count]; - ++transpose_hero_count; - } - - mlir::Value value = builder.create( - hero_to_shmem_tensor[transpose], shmem_indices); - auto result_scalars = EmitEpilogue(computations, entry_function, - value, root_indices, builder); - SmallVector results; - results.reserve(output_tensor_args.size()); - for (auto [tensor, value] : - llvm::zip(output_tensors, result_scalars)) { - results.push_back( - builder.create(value, tensor, root_indices)); - } - return results; - }); - result_tensors.append(subresult_tensors.begin(), subresult_tensors.end()); - } else { - auto indexing = ComputeThreadIdToOutputIndexing(0, builder.getContext()); - TF_RET_CHECK(indexing) << "Indexing is never nullopt"; - auto subresult_tensors = EmitThreadLoopNest( - builder, output_tensor_args, *indexing, - [&](ValueRange output_tensors, ValueRange dim_values, - ValueRange symbol_values) -> SmallVector { - auto output_indices = ApplyAffineMap( - indexing->GetAffineMap(), dim_values, symbol_values, builder); - - // Generate the operands for the root function: input tensors + - // output indices. - llvm::SmallVector operands( - entry_function.getArguments().take_front(num_inputs)); - absl::c_copy(output_indices, std::back_inserter(operands)); - - auto result_scalars = - builder.create(call_targets(root), operands); - - SmallVector results; - results.reserve(output_tensor_args.size()); - for (auto [tensor, value] : - llvm::zip(output_tensors, result_scalars.getResults())) { - results.push_back( - builder.create(value, tensor, output_indices)); - } - return results; - }); - result_tensors.append(subresult_tensors.begin(), subresult_tensors.end()); - } - } builder.create(result_tensors); return absl::OkStatus(); } @@ -429,7 +331,7 @@ absl::Status MlirTransposeFusion::EmitReadFromShMemMlir( std::vector MlirTransposeFusion::GetInstructionsWithCustomCodegen( const HloFusionInstruction& fusion) const { - return {shmem_transposes_.begin(), shmem_transposes_.end()}; + return GetShMemTransposes(analysis_); } absl::Status MlirTransposeFusion::EmitEntryFunction( diff --git a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h index 3df6073f5d924e..58c8d6265ae838 100644 --- a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h +++ b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h @@ -17,15 +17,14 @@ limitations under the License. 
#include #include +#include -#include "absl/container/flat_hash_set.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/ImplicitLocOpBuilder.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/ValueRange.h" // from @llvm-project -#include "mlir/Interfaces/DataLayoutInterfaces.h" // from @llvm-project #include "xla/hlo/ir/hlo_instructions.h" #include "xla/service/gpu/fusions/mlir/computation_partitioner.h" #include "xla/service/gpu/fusions/mlir/mlir_fusion_emitter.h" @@ -59,9 +58,17 @@ class MlirTransposeFusion : public MlirFusionEmitterBase { std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const override; + mlir::MLIRContext* ctx) const override { + return ComputeThreadIdToInputIndexing( + *analysis_.fusion_heroes()[root_index], ctx); + } protected: + IndexingMap ComputeThreadIdToInputIndexing(const HloInstruction& hero, + mlir::MLIRContext* ctx) const; + IndexingMap ComputeThreadIdToOutputIndexing(const HloInstruction& hero, + mlir::MLIRContext* ctx) const; + absl::Status EmitEntryFunction( const mlir_converter::PartitionedComputations& computations, const mlir_converter::CallTargetProvider& call_targets, @@ -87,7 +94,7 @@ class MlirTransposeFusion : public MlirFusionEmitterBase { const HloFusionAnalysis& analysis_; Tiling tiling_; Vector3 permutation_; - absl::flat_hash_set shmem_transposes_; + std::vector shmem_transposes_; }; } // namespace gpu diff --git a/third_party/xla/xla/service/gpu/model/indexing_analysis.cc b/third_party/xla/xla/service/gpu/model/indexing_analysis.cc index eca6ccd0dc067b..a6a14c28ca8161 100644 --- a/third_party/xla/xla/service/gpu/model/indexing_analysis.cc +++ b/third_party/xla/xla/service/gpu/model/indexing_analysis.cc @@ -1188,6 +1188,10 @@ HloInstructionIndexing ComputeInputToOutputIndexing(const HloInstruction* instr, if (auto transpose = DynCast(instr)) { return ComputeInputToOutputTransposeOpIndexing(transpose, ctx); } + if (instr->opcode() == HloOpcode::kTuple) { + return HloInstructionIndexing::FromIndexingMaps( + {CreateIdentityMap(instr->shape().tuple_shapes(input_id), ctx)}); + } // If we cannot compute input-to-output indexing, we return std::nullopt for // every op result. 
int64_t num_results = From d1b0fb4020e8020e41c86f18fc6685df82a96188 Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Mon, 18 Mar 2024 08:05:01 -0700 Subject: [PATCH 024/670] Support sparse dots in GemmFusion pass The codegen will only support this for NVidia GPUs, which have the following restrictions: - only 2:4 structured sparsity is allowed; - only the first dot operand may be sparse; PiperOrigin-RevId: 616829925 --- .../xla/xla/service/gpu/gemm_fusion.cc | 35 ++++++++++-- .../xla/xla/service/gpu/gemm_fusion_test.cc | 57 +++++++++++++++++++ 2 files changed, 87 insertions(+), 5 deletions(-) diff --git a/third_party/xla/xla/service/gpu/gemm_fusion.cc b/third_party/xla/xla/service/gpu/gemm_fusion.cc index 999989208d73ce..e98904b364443f 100644 --- a/third_party/xla/xla/service/gpu/gemm_fusion.cc +++ b/third_party/xla/xla/service/gpu/gemm_fusion.cc @@ -162,14 +162,17 @@ struct HlosAndRequirements { HloInstruction& FuseDot(const HloDotInstruction& dot, const HloInstruction& fused_lhs, const HloInstruction& fused_rhs, + std::optional fused_meta, HloComputation::Builder& builder // append ) { - CHECK_EQ(dot.operand_count(), 2); VLOG(3) << "Fusing " << dot.ToString(); - std::array hlo_new_operands = { + std::vector hlo_new_operands = { const_cast(&fused_lhs), const_cast(&fused_rhs)}; + if (fused_meta.has_value()) { + hlo_new_operands.push_back(const_cast(fused_meta.value())); + } return *builder.AddInstruction( dot.CloneWithNewOperands(dot.shape(), hlo_new_operands)); } @@ -620,12 +623,33 @@ absl::StatusOr CreateDotFusion( return can_handle; } + // Verify sparse dot constraints. + if (dot.sparse_operands()) { + const SparsityDescriptor& descriptor = dot.sparsity().front(); + if (dot.sparse_operands() != 1 || descriptor.index() != 0) { + return InvalidArgument("Sparsity is only supported on left operand"); + } + if (descriptor.type() != SparsityType::SPARSITY_STRUCTURED_N_M || + descriptor.n() != 2 || descriptor.m() != 4) { + return InvalidArgument("Only 2:4 structured sparsity is supported"); + } + // DotDimensionSorter pass makes sure the sparse dimension is minor. + CHECK_EQ(descriptor.dimension(), dot.operand(0)->shape().rank() - 1); + } + HlosAndRequirements lhs_hlos_and_reqs = FuseDotOperand( dot, /*operand_index=*/0, gpu_version, builder, fusion_inputs); HlosAndRequirements rhs_hlos_and_reqs = FuseDotOperand( dot, /*operand_index=*/1, gpu_version, builder, fusion_inputs); - HloInstruction& fused_dot = FuseDot(dot, *lhs_hlos_and_reqs.fused_hlo, - *rhs_hlos_and_reqs.fused_hlo, builder); + std::optional meta_hlo; + if (dot.sparse_operands()) { + HlosAndRequirements meta_hlos_and_reqs = FuseDotOperand( + dot, /*operand_index=*/2, gpu_version, builder, fusion_inputs); + meta_hlo.emplace(meta_hlos_and_reqs.fused_hlo); + } + HloInstruction& fused_dot = + FuseDot(dot, *lhs_hlos_and_reqs.fused_hlo, *rhs_hlos_and_reqs.fused_hlo, + meta_hlo, builder); // For now the RHS doesn't support splits, so it also doesn't impose any // requirements. 
HlosAndRequirements fused_output_and_reqs = @@ -642,7 +666,8 @@ absl::StatusOr CreateDotFusion( dot.precision_config().algorithm(); if (algorithm == PrecisionConfig::ALG_DOT_BF16_BF16_F32_X6 || algorithm == PrecisionConfig::ALG_DOT_BF16_BF16_F32_X3 || - dot.GetModule()->config().debug_options().xla_gpu_triton_gemm_any()) { + dot.GetModule()->config().debug_options().xla_gpu_triton_gemm_any() || + dot.sparse_operands()) { return FusionDecision{}; } diff --git a/third_party/xla/xla/service/gpu/gemm_fusion_test.cc b/third_party/xla/xla/service/gpu/gemm_fusion_test.cc index 43c1c155fd4189..bdb1be455024f0 100644 --- a/third_party/xla/xla/service/gpu/gemm_fusion_test.cc +++ b/third_party/xla/xla/service/gpu/gemm_fusion_test.cc @@ -1148,6 +1148,63 @@ ENTRY e { })"); } +class SparseDotTest : public GemmFusionTest {}; + +TEST_F(SparseDotTest, DotWithSparseLhsOperandIsRewritten) { + auto module = ParseAndReturnVerifiedModule(R"( +HloModule test +ENTRY main { + lhs = f16[2,16] parameter(0) + rhs = f16[32,2] parameter(1) + meta = u16[2,2] parameter(2) + ROOT dot = f32[2,2] dot(lhs, rhs, meta), + lhs_contracting_dims={1}, rhs_contracting_dims={0}, sparsity=L.1@2:4 +})") + .value(); + EXPECT_TRUE(GemmFusion(gpu_version_).Run(module.get()).value()); + + MatchHloModule(*module, R"( +; CHECK-LABEL: ENTRY %main ({{.*}}: f16[2,16], {{.*}}: f16[32,2], {{.*}}: u16[2,2]) -> f32[2,2] { +; CHECK-NEXT: [[P0:%[^ ]+]] = f16[2,16]{1,0} parameter(0) +; CHECK-NEXT: [[P1:%[^ ]+]] = f16[32,2]{1,0} parameter(1) +; CHECK-NEXT: [[META:%[^ ]+]] = u16[2,2]{1,0} parameter(2) +; CHECK: ROOT {{.*}} = f32[2,2]{1,0} +; CHECK-SAME: fusion(f16[2,16]{1,0} [[P0]], f16[32,2]{1,0} [[P1]], u16[2,2]{1,0} [[META]]), +; CHECK-SAME: kind=kCustom +; CHECK-SAME: __triton_gemm +})"); +} + +TEST_F(SparseDotTest, DotWithSparseRhsOperandIsNotSupported) { + auto module = ParseAndReturnVerifiedModule(R"( +HloModule test +ENTRY main { + lhs = f16[2,32] parameter(0) + rhs = f16[16,2] parameter(1) + meta = u16[2,2] parameter(2) + ROOT dot = f32[2,2] dot(lhs, rhs, meta), + lhs_contracting_dims={1}, rhs_contracting_dims={0}, sparsity=R.0@2:4 +})") + .value(); + auto result = GemmFusion(gpu_version_).Run(module.get()); + EXPECT_FALSE(result.ok()); +} + +TEST_F(SparseDotTest, UnsupportedSparsityType) { + auto module = ParseAndReturnVerifiedModule(R"( +HloModule test +ENTRY main { + lhs = f16[2,8] parameter(0) + rhs = f16[32,2] parameter(1) + meta = u16[2,1] parameter(2) + ROOT dot = f32[2,2] dot(lhs, rhs, meta), + lhs_contracting_dims={1}, rhs_contracting_dims={0}, sparsity=L.1@1:4 +})") + .value(); + auto result = GemmFusion(gpu_version_).Run(module.get()); + EXPECT_FALSE(result.ok()); +} + } // namespace } // namespace gpu } // namespace xla From 3a717efddf65842ff10f24f71eecbc47490a0422 Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Mon, 18 Mar 2024 08:18:49 -0700 Subject: [PATCH 025/670] Support sparse dots in GemmFusionAutotuner pass 1) Add `allow_cublas` flag and set it to false for sparse dots (we cannot run cublas for reference, as it doesn't support sparsity). 2) Make sure the configs that are not supported by the codegen are excluded. Specifically, if there are more threads than metadata values, it'd fail. 3) For deviceless compilations, apply the `ReduceTileSizes` to the default config, as otherwise it produces an incorrect config for sparse dots (too many threads). 
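
For illustration, a minimal standalone sketch of the tile filter described in
point 2), assuming 2:4 structured sparsity (one metadata element covers 16
dense elements of the LHS tile) and a 32-thread warp; the helper name
SparseTilingHasEnoughMetadata, the kWarpSize constant, and the sample tilings
below are made up for this sketch and are not part of the pass itself:

    #include <cstdint>
    #include <iostream>

    namespace {

    constexpr int64_t kWarpSize = 32;  // assumption: NVIDIA warp size

    // A sparse tiling is usable only if every thread gets at least one
    // sparsity-metadata element, i.e. block_m * block_k / 16 >= thread count.
    bool SparseTilingHasEnoughMetadata(int64_t block_m, int64_t block_k,
                                       int64_t num_warps) {
      const int64_t meta_elements = block_m * block_k / 16;
      return meta_elements >= num_warps * kWarpSize;
    }

    }  // namespace

    int main() {
      // 16x16 tile, 4 warps: 16 metadata elements for 128 threads -> exclude.
      std::cout << SparseTilingHasEnoughMetadata(16, 16, 4) << "\n";  // prints 0
      // 64x128 tile, 4 warps: 512 metadata elements for 128 threads -> keep.
      std::cout << SparseTilingHasEnoughMetadata(64, 128, 4) << "\n";  // prints 1
      return 0;
    }

In the patch below, the exhaustive search skips configs for which this
predicate is false, while ReduceTileSizes instead clamps num_warps so that the
predicate holds.
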
PiperOrigin-RevId: 616833330 --- third_party/xla/xla/service/gpu/BUILD | 1 + .../xla/service/gpu/gemm_fusion_autotuner.cc | 80 +++++++++++++------ .../service/gpu/gemm_fusion_autotuner_test.cc | 34 +++++++- 3 files changed, 90 insertions(+), 25 deletions(-) diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD index bc0f46f49bef75..59d7cc553ec9cc 100644 --- a/third_party/xla/xla/service/gpu/BUILD +++ b/third_party/xla/xla/service/gpu/BUILD @@ -766,6 +766,7 @@ xla_test( ":backend_configs_cc", ":gemm_fusion", ":gemm_fusion_autotuner", + ":ir_emission_utils", ":matmul_utils", "//xla:autotuning_proto_cc", "//xla:error_spec", diff --git a/third_party/xla/xla/service/gpu/gemm_fusion_autotuner.cc b/third_party/xla/xla/service/gpu/gemm_fusion_autotuner.cc index a53affc736bb3c..e68364f71d3903 100644 --- a/third_party/xla/xla/service/gpu/gemm_fusion_autotuner.cc +++ b/third_party/xla/xla/service/gpu/gemm_fusion_autotuner.cc @@ -188,6 +188,9 @@ class GemmFusionAutotunerVisitor : public DfsHloRewriteVisitor { // This contains all alternative Triton GEMM configs related to one fusion. struct GemmConfigSet { std::vector configs; + // Setting this to true disallows verification and fallback to cuBLAS, and + // the usage of cuDNN. + bool has_sparsity = false; }; using CuDnnPlanId = int64_t; @@ -259,10 +262,12 @@ class GemmConfigSetCollector : public ConstDfsHloVisitorWithDefault { fusion->GetModule()->config().debug_options(); auto cuda_comp = std::get(config_.GetGpuComputeCapability()); - return {GetPossibleMatmulAutotuneConfigs( - *Cast(hlo_query::GetFirstInstructionWithOpcode( - *fusion->called_computations().at(0), HloOpcode::kDot)), - cuda_comp, debug_options, config_.ExhaustiveTilingSearch())}; + const HloDotInstruction* dot_instr = + Cast(hlo_query::GetFirstInstructionWithOpcode( + *fusion->called_computations().at(0), HloOpcode::kDot)); + auto configs = GetPossibleMatmulAutotuneConfigs( + *dot_instr, cuda_comp, debug_options, config_.ExhaustiveTilingSearch()); + return {configs, /*has_sparsity=*/dot_instr->sparse_operands() > 0}; } AutotuneConfig config_; @@ -294,8 +299,11 @@ TileSizeLimit GetUpperLimit(const HloDotInstruction& dot) { std::max(tsl::NextPowerOfTwoS64(m), kMinTileSize); const int64_t block_n_limit = std::max(tsl::NextPowerOfTwoS64(n), kMinTileSize); + // Increase minimum tile size for the contracting dimension proportionally + // to the sparsity multiplier (assume 2:4 structured sparsity). const int64_t block_k_limit = - std::max(tsl::NextPowerOfTwoS64(k), kMinTileSize); + std::max(tsl::NextPowerOfTwoS64(k), + kMinTileSize * (dot.sparse_operands() ? 2 : 1)); return {block_m_limit, block_n_limit, block_k_limit}; } @@ -345,6 +353,12 @@ std::vector GetExhaustiveMatmulAutotuneConfigs( if (block_k > limit.block_k) { continue; } + // Sparse meta should have at least one element per thread. + // Note: only 2:4 structured sparsity is currently supported. + if (dot.sparse_operands() && + block_m * block_k / 16 < num_warps * WarpSize()) { + continue; + } for (int split_k : SPLIT_K) { if (split_k > std::min(max_split_k, @@ -429,6 +443,13 @@ std::vector ReduceTileSizes( config.block_k = std::min(config.block_k, limit.block_k); config.split_k = std::min( config.split_k, GetSplitKLimit(config.block_k, limit.block_k)); + // Sparse meta should have at least one element per thread. + // Note: only 2:4 structured sparsity is currently supported. 
+ if (dot.sparse_operands()) { + int meta_elements = config.block_m * config.block_k / 16; + config.num_warps = + std::min(config.num_warps, meta_elements / WarpSize()); + } } // Remove duplicates. @@ -632,16 +653,16 @@ CompileMany(const AutotuneConfig& config, AutotunerCompileUtil& util, if (IsFusionKind(hlo, kTritonGemmFusionKind)) { config_count += gemm_config_set.configs.size(); - if (IsCuDnnEnabled(config, debug_opts) && + if (!gemm_config_set.has_sparsity && IsCuDnnEnabled(config, debug_opts) && HasAlgorithmSupportedByCudnn(hlo)) { config_count += GetCuDnnPlanCount(hlo, config); } } else if (IsFusionKind(hlo, kCuDnnFusionKind)) { config_count += GetCuDnnPlanCount(hlo, config); } + // Reference config for verification (uses cuBLAS). + config_count += !gemm_config_set.has_sparsity; } - // cuBLAS configs: one per fusion. - config_count += gemm_config_sets.size(); std::atomic done_count = 0; std::atomic good_count = 0; @@ -756,16 +777,19 @@ CompileMany(const AutotuneConfig& config, AutotunerCompileUtil& util, }); } - thread_pool->Schedule([&, fusion] { - absl::StatusOr has_executable = - compile_reference_executable(fusion); - TF_CHECK_OK(has_executable.status()); - log(has_executable.value()); - counter.DecrementCount(); - }); + if (!gemm_config_set.has_sparsity) { + thread_pool->Schedule([&, fusion] { + absl::StatusOr has_executable = + compile_reference_executable(fusion); + TF_CHECK_OK(has_executable.status()); + log(has_executable.value()); + counter.DecrementCount(); + }); + } if (IsFusionKind(*fusion, kCuDnnFusionKind) || (IsFusionKind(*fusion, kTritonGemmFusionKind) && + !gemm_config_set.has_sparsity && IsCuDnnEnabled(config, debug_opts) && HasAlgorithmSupportedByCudnn(*fusion))) { const int plan_count = GetCuDnnPlanCount(*fusion, config); @@ -803,12 +827,15 @@ CompileMany(const AutotuneConfig& config, AutotunerCompileUtil& util, log(has_executable); } - TF_ASSIGN_OR_RETURN(bool has_executable, - compile_reference_executable(fusion)); - log(has_executable); + if (!gemm_config_set.has_sparsity) { + TF_ASSIGN_OR_RETURN(bool has_executable, + compile_reference_executable(fusion)); + log(has_executable); + } if (IsFusionKind(*fusion, kCuDnnFusionKind) || (IsFusionKind(*fusion, kTritonGemmFusionKind) && + !gemm_config_set.has_sparsity && IsCuDnnEnabled(config, debug_opts) && HasAlgorithmSupportedByCudnn(*fusion))) { const int plan_count = GetCuDnnPlanCount(*fusion, config); @@ -864,11 +891,10 @@ absl::StatusOr Execute(const AutotuneConfig& config, input_shapes.push_back(param->shape()); } - // Run with cuBLAS. + // Run with cuBLAS (optional). std::optional reference_buffer; - absl::Duration cublas_duration; - { - TF_RET_CHECK(executable_set.reference != nullptr); + absl::Duration cublas_duration = absl::InfiniteDuration(); + if (executable_set.reference != nullptr) { TF_ASSIGN_OR_RETURN(std::optional output, util.ProfileExecutable(&*executable_set.reference, stream, inputs, input_shapes)); @@ -925,7 +951,9 @@ absl::StatusOr Execute(const AutotuneConfig& config, *res.mutable_run_time() = tsl::proto_utils::ToDurationProto(profiling_output->duration); - if (config.should_check_correctness()) { + // Reference buffer is available when `config.should_check_correctness()` + // is set and reference executable was compiled. 
+ if (reference_buffer.has_value()) { TF_ASSIGN_OR_RETURN( se::RedzoneAllocator::RedzoneCheckStatus rz_check_status, rz_allocator.CheckRedzones()); @@ -1157,7 +1185,11 @@ absl::StatusOr GemmFusionAutotuner::Run( if (IsFusionKind(*fusion, kCuDnnFusionKind)) { res.mutable_algorithm()->set_algo_id(-1); } else { - *res.mutable_triton() = kDefaultGemmTiling.ToProto(); + const HloDotInstruction* dot_instr = + Cast(hlo_query::GetFirstInstructionWithOpcode( + *fusion->called_computations().at(0), HloOpcode::kDot)); + auto config = ReduceTileSizes(*dot_instr, {kDefaultGemmTiling}).front(); + *res.mutable_triton() = config.ToProto(); } *res.mutable_run_time() = tsl::proto_utils::ToDurationProto(absl::ZeroDuration()); diff --git a/third_party/xla/xla/service/gpu/gemm_fusion_autotuner_test.cc b/third_party/xla/xla/service/gpu/gemm_fusion_autotuner_test.cc index 4d886a8f68988d..a0d3e85a782356 100644 --- a/third_party/xla/xla/service/gpu/gemm_fusion_autotuner_test.cc +++ b/third_party/xla/xla/service/gpu/gemm_fusion_autotuner_test.cc @@ -37,6 +37,7 @@ limitations under the License. #include "xla/service/gpu/autotuner_util.h" #include "xla/service/gpu/backend_configs.pb.h" #include "xla/service/gpu/gemm_fusion.h" +#include "xla/service/gpu/ir_emission_utils.h" #include "xla/service/gpu/matmul_utils.h" #include "xla/service/hlo_module_config.h" #include "xla/service/hlo_pass_pipeline.h" @@ -694,7 +695,7 @@ ENTRY e { RunFileCheck( module->ToString(HloPrintOptions{}.set_print_operand_shape(false)), R"( -// CHECK: backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"fusion_backend_config":{"kind":"__triton_gemm","triton_gemm_config":{"block_m":"32","block_n":"32","block_k":"32","split_k":"1","num_stages":"1","num_warps":"4","num_ctas":"1"}}} +// CHECK: backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"fusion_backend_config":{"kind":"__triton_gemm","triton_gemm_config":{"block_m":"16","block_n":"16","block_k":"16","split_k":"1","num_stages":"1","num_warps":"4","num_ctas":"1"}}} )")); EXPECT_TRUE(filecheck_matches); } else { @@ -770,6 +771,37 @@ ENTRY e { [](const TritonGemmConfig& config) { return config.split_k == 1; })); } +class GemmFusionAutotunerConfigTest + : public StatelessAutotunerTest, + public ::testing::WithParamInterface {}; + +TEST_P(GemmFusionAutotunerConfigTest, SparseDotDiscardsUnsupportedTiles) { + const std::string kHloText = R"( +HloModule test +ENTRY wais { + lhs = f16[5,1600] parameter(0) + rhs = f16[3200,10] parameter(1) + meta = u16[5,200] parameter(2) + ROOT dot = f32[5,10] dot(lhs, rhs, meta), + lhs_contracting_dims={1}, rhs_contracting_dims={0}, sparsity=L.1@2:4 +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(kHloText)); + auto dot = + Cast(module->entry_computation()->root_instruction()); + + auto configs = GetPossibleMatmulAutotuneConfigs( + *dot, se::CudaComputeCapability{8, 0}, GetDebugOptionsForTest(), + /*exhaustive_tiling_search=*/GetParam()); + for (const auto& config : configs) { + int metadata_size = config.block_m * config.block_k / 16; + EXPECT_LE(config.num_warps * WarpSize(), metadata_size); + EXPECT_GT(config.block_k, 16); // kMinTileSize + } +} + +INSTANTIATE_TEST_SUITE_P(GemmFusionAutotunerConfigSweep, + GemmFusionAutotunerConfigTest, ::testing::Bool()); + } // namespace } // namespace gpu } // namespace xla From ea0619d9091b50a6204cef9085953deef9d98c91 Mon Sep 17 00:00:00 2001 From: Eunjae Kim Date: Mon, 18 Mar 2024 08:18:52 -0700 Subject: [PATCH 026/670] Insert a task to the low priority task queue 
when the criticality is one of the fixed list of low priority criticalities and support padding the high priority batch with the unbatched tasks given via the ProcessBatchCallBack. PiperOrigin-RevId: 616833344 --- tensorflow/core/kernels/BUILD | 17 +- tensorflow/core/kernels/batch_kernels_test.cc | 331 ++++++++++++++++-- tensorflow/core/kernels/batching_util/BUILD | 3 + .../batching_util/batch_resource_base.cc | 149 +++++--- .../batching_util/batch_resource_base.h | 34 +- .../batching_util/shared_batch_scheduler.h | 122 +++++-- .../shared_batch_scheduler_test.cc | 234 ++++++++++++- 7 files changed, 763 insertions(+), 127 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index aa313ddbd3b032..f97593aaf43898 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -1645,19 +1645,34 @@ cc_library( tf_cc_test( name = "batch_kernels_test", - size = "small", + size = "medium", srcs = ["batch_kernels_test.cc"], features = ["-layering_check"], deps = [ ":batch_kernel_test_util", ":batch_kernels", + ":cwise_op", ":function_ops", ":shape_ops", + "//tensorflow/core:core_cpu_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:math_ops_op_lib", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "//tensorflow/core/framework:types_proto_cc", "//tensorflow/core/kernels/batching_util:warmup", + "//tensorflow/core/platform:status", + "//tensorflow/core/protobuf:for_core_protos_cc", + "//tensorflow/core/public:version", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", "@local_tsl//tsl/platform:blocking_counter", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:refcount", + "@local_tsl//tsl/platform:status", ], ) diff --git a/tensorflow/core/kernels/batch_kernels_test.cc b/tensorflow/core/kernels/batch_kernels_test.cc index 68ae309504ffb6..320d17b14396c2 100644 --- a/tensorflow/core/kernels/batch_kernels_test.cc +++ b/tensorflow/core/kernels/batch_kernels_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include #include +#include "absl/status/status.h" #include "absl/strings/match.h" #include "tensorflow/core/common_runtime/rendezvous_mgr.h" #include "tensorflow/core/framework/device_factory.h" @@ -39,6 +40,7 @@ limitations under the License. #include "tensorflow/core/public/version.h" #include "tsl/lib/core/status_test_util.h" #include "tsl/platform/blocking_counter.h" +#include "tsl/platform/criticality.h" #include "tsl/platform/errors.h" #include "tsl/platform/refcount.h" #include "tsl/platform/status.h" @@ -65,10 +67,283 @@ TEST_P(BatchFunctionKernelTest, EnableAdaptiveScheduler) { INSTANTIATE_TEST_SUITE_P(Params, BatchFunctionKernelTest, ::testing::Bool()); -class BatchFunctionKernelParallelWarmupTestState : public OpsTestBase { +class SharedBatchFunctionTestState : public OpsTestBase { public: // Init test fixture with a batch kernel instance. 
- Status Init(bool enable_splitting) { + void CreateFunctionLibraryRuntime() { + pflr_ = std::make_unique( + device_mgr_.get(), Env::Default(), /*config=*/nullptr, + TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(), + /*thread_pool=*/nullptr, /*parent=*/nullptr, + /*session_metadata=*/nullptr, + Rendezvous::Factory{[](const int64_t, const DeviceMgr *device_mgr, + tsl::core::RefCountPtr *r) { + *r = tsl::core::RefCountPtr( + new IntraProcessRendezvous(device_mgr)); + return absl::OkStatus(); + }}); + } +}; + +class BatchFunctionTestState : public SharedBatchFunctionTestState { + public: + // Init test fixture with a batch kernel instance. The caller guarantees that + // the device pointer is valid throughout the life of this class. + absl::Status Init(Device *device, bool enable_low_priority_queue) { + // Override the per-test/per-op device with a given device so that it can + // be shared between ops. + device_ = device; + + NameAttrList f; + f.set_name("ShapeEnforcingFunction"); + FunctionDef func = FunctionDefHelper::Create( + // function_name + f.name(), + // in_def + {"x:int64"}, + // out_def + {"o:int64"}, + // attr_def + {}, + // node_def + {{{"o"}, + "EnsureShape", + {"x"}, + {{"T", DataType::DT_INT64}, {"shape", TensorShape({4, 2})}}}}, + // ret_def + {{"o", "o:output"}}); + TF_RETURN_IF_ERROR(flib_def_->AddFunctionDef(func)); + SharedBatchFunctionTestState::CreateFunctionLibraryRuntime(); + + std::vector inputs( + {NodeDefBuilder::NodeOut({"n1", 0, DataType::DT_INT64})}); + TF_RETURN_IF_ERROR(NodeDefBuilder("BatchTPUInput", "BatchFunction") + .Attr("max_batch_size", 4) + .Attr("num_batch_threads", 4) + .Attr("allowed_batch_sizes", {4}) + .Attr("batch_timeout_micros", 5000000) + .Attr("max_enqueued_batches", 10) + .Attr("low_priority_max_batch_size", + enable_low_priority_queue ? 64 : 0) + .Attr("low_priority_batch_timeout_micros", + enable_low_priority_queue ? 50000000 : 0) + .Attr("low_priority_allowed_batch_sizes", + enable_low_priority_queue ? std::vector{1} + : std::vector()) + .Attr("low_priority_max_enqueued_batches", + enable_low_priority_queue ? 100 : 0) + .Attr("Tin", {DataType::DT_INT64}) + .Input(inputs) + .Attr("Tcaptured", std::vector{}) + .Input(std::vector{}) + .Attr("Tout", std::vector{DT_INT64}) + .Attr("f", f) + .Finalize(node_def())); + return OpsTestBase::InitOp(); + } + + void TestBody() override {} +}; + +class BatchFunctionTest : public ::testing::TestWithParam { + protected: + void SetUp() override { + // The device needs to be shared in each test case and within each test case + // only. + cpu_device_ = + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"); + } + std::unique_ptr cpu_device_; +}; + +TEST_P(BatchFunctionTest, BatchingWorksWithoutCriticality) { + SessionMetadata session_metadata; + session_metadata.set_name("test_model"); + session_metadata.set_version(123); + + bool enable_low_priority_queue = GetParam(); + { + tsl::BlockingCounter blocking_counter(4); + // 8 threads run the batch op with no explicit criticality set. They are + // eventually batched to form a tensor with [4, 2] shape which is verified + // within the function. 
+ for (int i = 0; i < 4; ++i) { + Env::Default()->SchedClosure([&]() { + ASSERT_EQ(tsl::criticality::GetCriticality(), + tsl::criticality::Criticality::kCritical); + + BatchFunctionTestState test_state; + test_state.set_session_metadata(session_metadata); + TF_ASSERT_OK( + test_state.Init(cpu_device_.get(), enable_low_priority_queue)); + test_state.AddInputFromList(TensorShape({1, 2}), {123, 456}); + TF_EXPECT_OK(test_state.RunOpKernel()); + + test::ExpectTensorEqual( + *test_state.GetOutput(0), + test::AsTensor({123, 456}, TensorShape({1, 2}))); + blocking_counter.DecrementCount(); + }); + } + + blocking_counter.Wait(); + } +} + +TEST_P(BatchFunctionTest, PaddingWorksWithoutCriticality) { + SessionMetadata session_metadata; + session_metadata.set_name("test_model"); + session_metadata.set_version(123); + + bool enable_low_priority_queue = GetParam(); + { + tsl::BlockingCounter blocking_counter(2); + // 2 threads run the batch op with no explicit criticality set. They are + // eventually batched and padded to form a tensor with [4, 2] shape which is + // verified within the function. + for (int i = 0; i < 2; ++i) { + Env::Default()->SchedClosure([&]() { + ASSERT_EQ(tsl::criticality::GetCriticality(), + tsl::criticality::Criticality::kCritical); + + BatchFunctionTestState test_state; + test_state.set_session_metadata(session_metadata); + TF_ASSERT_OK( + test_state.Init(cpu_device_.get(), enable_low_priority_queue)); + test_state.AddInputFromList(TensorShape({1, 2}), {123, 456}); + TF_EXPECT_OK(test_state.RunOpKernel()); + + test::ExpectTensorEqual( + *test_state.GetOutput(0), + test::AsTensor({123, 456}, TensorShape({1, 2}))); + blocking_counter.DecrementCount(); + }); + } + + blocking_counter.Wait(); + } +} + +#if defined(PLATFORM_GOOGLE) +TEST_P(BatchFunctionTest, BatchingWorks) { + SessionMetadata session_metadata; + session_metadata.set_name("test_model"); + session_metadata.set_version(123); + + bool enable_low_priority_queue = GetParam(); + { + tsl::BlockingCounter blocking_counter(4); + // 2 threads run the batch op with critical plus and 2 threads run the batch + // op with sheddable. They are eventually batched to form a tensor with [4, + // 2] shape which is verified within the function. 
+ for (int i = 0; i < 2; ++i) { + Env::Default()->SchedClosure([&]() { + tsl::criticality::ScopedCriticality scoped_criticality( + tsl::criticality::Criticality::kCriticalPlus); + ASSERT_EQ(tsl::criticality::GetCriticality(), + tsl::criticality::Criticality::kCriticalPlus); + + BatchFunctionTestState test_state; + test_state.set_session_metadata(session_metadata); + TF_ASSERT_OK( + test_state.Init(cpu_device_.get(), enable_low_priority_queue)); + test_state.AddInputFromList(TensorShape({1, 2}), {123, 456}); + TF_EXPECT_OK(test_state.RunOpKernel()); + + test::ExpectTensorEqual( + *test_state.GetOutput(0), + test::AsTensor({123, 456}, TensorShape({1, 2}))); + blocking_counter.DecrementCount(); + }); + } + + for (int i = 0; i < 2; ++i) { + Env::Default()->SchedClosure([&]() { + tsl::criticality::ScopedCriticality scoped_criticality( + tsl::criticality::Criticality::kSheddable); + ASSERT_EQ(tsl::criticality::GetCriticality(), + tsl::criticality::Criticality::kSheddable); + + BatchFunctionTestState test_state; + test_state.set_session_metadata(session_metadata); + TF_ASSERT_OK( + test_state.Init(cpu_device_.get(), enable_low_priority_queue)); + test_state.AddInputFromList(TensorShape({1, 2}), {234, 567}); + TF_EXPECT_OK(test_state.RunOpKernel()); + + test::ExpectTensorEqual( + *test_state.GetOutput(0), + test::AsTensor({234, 567}, TensorShape({1, 2}))); + blocking_counter.DecrementCount(); + }); + } + + blocking_counter.Wait(); + } +} + +TEST_P(BatchFunctionTest, PaddingWorks) { + SessionMetadata session_metadata; + session_metadata.set_name("test_model"); + session_metadata.set_version(123); + + bool enable_low_priority_queue = GetParam(); + { + tsl::BlockingCounter blocking_counter(2); + // 1 thread run the batch op with critical plus and 1 threads run the batch + // op with sheddable. They are eventually batched and padded to form a + // tensor with [4, 2] shape which is verified within the function. 
+ Env::Default()->SchedClosure([&]() { + tsl::criticality::ScopedCriticality scoped_criticality( + tsl::criticality::Criticality::kCriticalPlus); + ASSERT_EQ(tsl::criticality::GetCriticality(), + tsl::criticality::Criticality::kCriticalPlus); + + BatchFunctionTestState test_state; + test_state.set_session_metadata(session_metadata); + TF_ASSERT_OK( + test_state.Init(cpu_device_.get(), enable_low_priority_queue)); + test_state.AddInputFromList(TensorShape({1, 2}), {123, 456}); + TF_EXPECT_OK(test_state.RunOpKernel()); + + test::ExpectTensorEqual( + *test_state.GetOutput(0), + test::AsTensor({123, 456}, TensorShape({1, 2}))); + blocking_counter.DecrementCount(); + }); + + Env::Default()->SchedClosure([&]() { + tsl::criticality::ScopedCriticality scoped_criticality( + tsl::criticality::Criticality::kSheddable); + ASSERT_EQ(tsl::criticality::GetCriticality(), + tsl::criticality::Criticality::kSheddable); + + BatchFunctionTestState test_state; + test_state.set_session_metadata(session_metadata); + TF_ASSERT_OK( + test_state.Init(cpu_device_.get(), enable_low_priority_queue)); + test_state.AddInputFromList(TensorShape({1, 2}), {234, 567}); + TF_EXPECT_OK(test_state.RunOpKernel()); + + test::ExpectTensorEqual( + *test_state.GetOutput(0), + test::AsTensor({234, 567}, TensorShape({1, 2}))); + blocking_counter.DecrementCount(); + }); + + blocking_counter.Wait(); + } +} +#endif + +INSTANTIATE_TEST_SUITE_P(BatchFunctionTest, BatchFunctionTest, + ::testing::Bool()); + +class BatchFunctionKernelParallelWarmupTestState + : public SharedBatchFunctionTestState { + public: + // Init test fixture with a batch kernel instance. + absl::Status Init(bool enable_splitting) { static auto *const cpu_device = []() { auto device = DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"); @@ -98,40 +373,29 @@ class BatchFunctionKernelParallelWarmupTestState : public OpsTestBase { // ret_def {{"o", "o:output"}}); TF_RETURN_IF_ERROR(flib_def_->AddFunctionDef(func)); - - pflr_ = std::make_unique( - device_mgr_.get(), Env::Default(), /*config=*/nullptr, - TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(), - /*thread_pool=*/nullptr, /*parent=*/nullptr, - /*session_metadata=*/nullptr, - Rendezvous::Factory{[](const int64_t, const DeviceMgr *device_mgr, - tsl::core::RefCountPtr *r) { - *r = tsl::core::RefCountPtr( - new IntraProcessRendezvous(device_mgr)); - return absl::OkStatus(); - }}); + SharedBatchFunctionTestState::CreateFunctionLibraryRuntime(); std::vector inputs( {NodeDefBuilder::NodeOut({"n1", 0, DataType::DT_INT64})}); - TF_CHECK_OK(NodeDefBuilder("BatchTPUInput", "BatchFunction") - .Attr("max_batch_size", enable_splitting ? 16 : 8) - .Attr("num_batch_threads", 8) - .Attr("allowed_batch_sizes", {2, 4, 8}) - .Attr("batch_timeout_micros", 1000000) - .Attr("max_enqueued_batches", 10) - .Attr("enable_large_batch_splitting", true) - .Attr("low_priority_max_batch_size", 64) - .Attr("low_priority_batch_timeout_micros", 8000) - .Attr("low_priority_allowed_batch_sizes", {32, 64}) - .Attr("low_priority_max_enqueued_batches", 1000) - .Attr("Tin", {DataType::DT_INT64}) - .Input(inputs) - .Attr("Tcaptured", std::vector{}) - .Input(std::vector{}) - .Attr("Tout", std::vector{DT_INT64}) - .Attr("f", f) - .Finalize(node_def())); - return InitOp(); + TF_RETURN_IF_ERROR(NodeDefBuilder("BatchTPUInput", "BatchFunction") + .Attr("max_batch_size", enable_splitting ? 
16 : 8) + .Attr("num_batch_threads", 8) + .Attr("allowed_batch_sizes", {2, 4, 8}) + .Attr("batch_timeout_micros", 1000000) + .Attr("max_enqueued_batches", 10) + .Attr("enable_large_batch_splitting", true) + .Attr("low_priority_max_batch_size", 64) + .Attr("low_priority_batch_timeout_micros", 8000) + .Attr("low_priority_allowed_batch_sizes", {32, 64}) + .Attr("low_priority_max_enqueued_batches", 1000) + .Attr("Tin", {DataType::DT_INT64}) + .Input(inputs) + .Attr("Tcaptured", std::vector{}) + .Input(std::vector{}) + .Attr("Tout", std::vector{DT_INT64}) + .Attr("f", f) + .Finalize(node_def())); + return OpsTestBase::InitOp(); } void TestBody() override {} @@ -200,5 +464,6 @@ TEST_P(BatchFunctionKernelParallelWarmupTest, ParallelWarmup) { INSTANTIATE_TEST_SUITE_P(BatchFunctionKernelParallelWarmupTestSuite, BatchFunctionKernelParallelWarmupTest, ::testing::Bool()); + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD index 2d9e06650e54b5..d34bd7331a35d5 100644 --- a/tensorflow/core/kernels/batching_util/BUILD +++ b/tensorflow/core/kernels/batching_util/BUILD @@ -161,6 +161,7 @@ cc_library( "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", "@com_google_absl//absl/time", + "@local_tsl//tsl/platform:criticality", "@local_tsl//tsl/platform:errors", ], ) @@ -180,6 +181,7 @@ cc_library( "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", "@com_google_absl//absl/time", + "@local_tsl//tsl/platform:criticality", ], alwayslink = 1, ) @@ -391,6 +393,7 @@ cc_library( "//tensorflow/core/util:incremental_barrier", "@com_google_absl//absl/container:fixed_array", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/functional:bind_front", "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", "@com_google_absl//absl/time", diff --git a/tensorflow/core/kernels/batching_util/batch_resource_base.cc b/tensorflow/core/kernels/batching_util/batch_resource_base.cc index 98a83fda8833a5..51d744616db8c6 100644 --- a/tensorflow/core/kernels/batching_util/batch_resource_base.cc +++ b/tensorflow/core/kernels/batching_util/batch_resource_base.cc @@ -29,6 +29,7 @@ limitations under the License. #include "absl/container/fixed_array.h" #include "absl/container/flat_hash_map.h" +#include "absl/functional/bind_front.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" @@ -50,6 +51,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_util.h" +#include "tensorflow/core/kernels/batching_util/batch_scheduler.h" #include "tensorflow/core/kernels/batching_util/concat_split_util.h" #include "tensorflow/core/kernels/batching_util/input_split_metadata.h" #include "tensorflow/core/kernels/batching_util/threadsafe_status.h" @@ -272,6 +274,17 @@ const string& GetModelName(OpKernelContext* ctx) { return ctx->session_metadata()->name(); } +// Returns the sum of the task sizes. The caller must guarantee that the +// unique_ptrs in the argument vectors are not null. 
+int GetTotalTaskSize( + const std::vector>& tasks) { + int tasks_size = 0; + for (const auto& task : tasks) { + tasks_size += task->size(); + } + return tasks_size; +} + } // namespace std::unique_ptr @@ -617,17 +630,22 @@ int BatchResourceBase::RoundToLowestAllowedBatchSize(int batch_size) const { } Status BatchResourceBase::ConcatInputTensors( - const BatchT& batch, OpKernelContext* context, - std::vector* concatenated_tensors) const { + const BatchT& batch, + const std::vector>& unbatched_tasks, + OpKernelContext* context, std::vector* concatenated_tensors) const { if (batch.num_tasks() == 0) { return errors::InvalidArgument("Empty batch."); } + + int unbatched_tasks_size = GetTotalTaskSize(unbatched_tasks); const bool just_for_warmup = batch.task(0).forced_warmup_batch_size > 0; const int padded_batch_size = - just_for_warmup ? batch.task(0).forced_warmup_batch_size - : RoundToLowestAllowedBatchSize(batch.size()); + just_for_warmup + ? batch.task(0).forced_warmup_batch_size + : RoundToLowestAllowedBatchSize(batch.size() + unbatched_tasks_size); const int padding_amount = - just_for_warmup ? padded_batch_size : padded_batch_size - batch.size(); + just_for_warmup ? padded_batch_size + : padded_batch_size - batch.size() - unbatched_tasks_size; profiler::TraceMe trace_me([padded_batch_size, padding_amount, disable_padding = batcher_queue_options_.disable_padding]() { @@ -636,6 +654,9 @@ Status BatchResourceBase::ConcatInputTensors( {"padding_amount", padding_amount}, {"disable_padding", disable_padding}}); }); + // TODO(b/316379576): Add metrics for the breakdown between the size of the + // original batch size and the unbatched task size and update the batch size + // to include the unbatched tasks. RecordPaddingSize(padding_amount, GetModelName(context), padded_batch_size, context->op_kernel().name()); RecordPaddingSizeV2(padding_amount, GetModelName(context), padded_batch_size, @@ -660,10 +681,14 @@ Status BatchResourceBase::ConcatInputTensors( if (just_for_warmup) { to_concatenate.reserve(padding_amount); } else { - to_concatenate.reserve(batch.num_tasks() + padding_amount); + to_concatenate.reserve(batch.num_tasks() + unbatched_tasks.size() + + padding_amount); for (int task_idx = 0; task_idx < batch.num_tasks(); ++task_idx) { to_concatenate.push_back(batch.task(task_idx).inputs.at(i)); } + for (int task_idx = 0; task_idx < unbatched_tasks.size(); ++task_idx) { + to_concatenate.push_back(unbatched_tasks[task_idx]->inputs.at(i)); + } } // Add padding as needed if padding is allowed. Use the first row of the @@ -794,7 +819,8 @@ Status BatchResourceBase::ConcatInputTensors( } Status BatchResourceBase::SplitOutputTensors( - const std::vector& combined_outputs, BatchT* batch) const { + const std::vector& combined_outputs, BatchT* batch, + std::vector>& unbatched_tasks) const { DCHECK_GE(batch->num_tasks(), 1); if (batch->num_tasks() < 1) { return errors::Internal("Batch size expected to be positive; was ", @@ -802,14 +828,20 @@ Status BatchResourceBase::SplitOutputTensors( } std::vector task_sizes_plus_optional_padding; - task_sizes_plus_optional_padding.reserve(batch->num_tasks()); + task_sizes_plus_optional_padding.reserve(batch->num_tasks() + + unbatched_tasks.size()); for (int i = 0; i < batch->num_tasks(); ++i) { task_sizes_plus_optional_padding.push_back(batch->task(i).size()); } - const int padding_size = - batcher_queue_options_.disable_padding - ? 
0 - : RoundToLowestAllowedBatchSize(batch->size()) - batch->size(); + for (int i = 0; i < unbatched_tasks.size(); ++i) { + task_sizes_plus_optional_padding.push_back(unbatched_tasks[i]->size()); + } + int unbatched_tasks_size = GetTotalTaskSize(unbatched_tasks); + const int padding_size = batcher_queue_options_.disable_padding + ? 0 + : RoundToLowestAllowedBatchSize( + batch->size() + unbatched_tasks_size) - + batch->size() - unbatched_tasks_size; if (padding_size > 0) { task_sizes_plus_optional_padding.push_back(padding_size); } @@ -829,7 +861,8 @@ Status BatchResourceBase::SplitOutputTensors( "Batched output tensor has 0 dimensions"); } if (output_tensor.shape().dim_size(0) != - static_cast(batch->size() + padding_size)) { + static_cast(batch->size() + unbatched_tasks_size + + padding_size)) { return errors::FailedPrecondition( "Batched output tensor's 0th dimension does not equal the sum of " "the 0th dimension sizes of the input tensors"); @@ -861,12 +894,35 @@ Status BatchResourceBase::SplitOutputTensors( task.context->set_output(i, split_tensor[j]); } } + for (int j = 0; j < unbatched_tasks.size(); ++j) { + // The unbatched tasks are not split, so no need to handle the partial + // case separately. + unbatched_tasks[j]->context->set_output( + i, split_tensor[batch->num_tasks() + j]); + } } return absl::OkStatus(); } -void BatchResourceBase::ProcessFuncBatch(std::unique_ptr batch) const { +void BatchResourceBase::CleanUpFunctionHelper(BatchTask& task, + const Status& status) const { + WithContext wc(task.propagated_context); + if (!status.ok()) { + if (!absl::StrContains(status.message(), + "Function was cancelled before it was started")) { + task.status->Update(status); + } else { + // Do not propagate this error; Prefer a more helpful error message. + LOG(ERROR) << "ERROR!!!! " << status.message(); + } + } + task.done_callback(); +} + +void BatchResourceBase::ProcessFuncBatch( + std::unique_ptr batch, + std::vector> unbatched_tasks) const { if (batch->empty()) { return; } @@ -896,24 +952,19 @@ void BatchResourceBase::ProcessFuncBatch(std::unique_ptr batch) const { if (cleanup_done) { return; } + // TODO(b/316379576): Update this to take the unbatch task cost into + // consideration when excluding the wasted cost and propagate cost to the + // unbatched tasks. SplitBatchCostsAndRecordMetrics(model_name, batch_cost_measurements, processed_size, *batch); // Clear the measurements before unblocking the batch task, as measurements // are associated with the task's thread context. batch_cost_measurements.clear(); for (int i = 0; i < batch->num_tasks(); ++i) { - WithContext wc(batch->task(i).propagated_context); - if (!status.ok()) { - if (!absl::StrContains( - status.message(), - "Function was cancelled before it was started")) { - batch->mutable_task(i)->status->Update(status); - } else { - // Do not propagate this error; Prefer a more helpful error message. - LOG(ERROR) << "ERROR!!!! 
" << status.message(); - } - } - batch->mutable_task(i)->done_callback(); + CleanUpFunctionHelper(*batch->mutable_task(i), status); + } + for (int i = 0; i < unbatched_tasks.size(); ++i) { + CleanUpFunctionHelper(*unbatched_tasks[i], status); } cleanup_done = true; }; @@ -927,7 +978,8 @@ void BatchResourceBase::ProcessFuncBatch(std::unique_ptr batch) const { } std::vector concatenated_tensors; - status = ConcatInputTensors(*batch, last_task_context, &concatenated_tensors); + status = ConcatInputTensors(*batch, unbatched_tasks, last_task_context, + &concatenated_tensors); processed_size = RoundToLowestAllowedBatchSize(batch->size()); if (!status.ok()) { return; @@ -969,7 +1021,8 @@ void BatchResourceBase::ProcessFuncBatch(std::unique_ptr batch) const { return; } if (last_task.forced_warmup_batch_size == 0) { - final_status = SplitOutputTensors(combined_outputs, batch.get()); + final_status = SplitOutputTensors(combined_outputs, batch.get(), + unbatched_tasks); } }); } @@ -1011,7 +1064,7 @@ void BatchResourceBase::ProcessBatch(std::unique_ptr batch) const { const int num_input_edges = batch->task(0).inputs.size(); std::vector concatenated_tensors; const Status concat_status = - ConcatInputTensors(*batch, last_task_context, &concatenated_tensors); + ConcatInputTensors(*batch, {}, last_task_context, &concatenated_tensors); processed_size = RoundToLowestAllowedBatchSize(batch->size()); OP_REQUIRES_OK_ASYNC(last_task_context, concat_status, last_task_callback); @@ -1081,6 +1134,20 @@ void BatchResourceBase::ProcessBatch(std::unique_ptr batch) const { return absl::OkStatus(); } +void BatchResourceBase::ProcessBatchCallBack( + std::unique_ptr> batch, + std::vector> unbatched_tasks) { + if (!session_metadata().name().empty()) { + absl::MutexLock lock(&outstanding_batch_mu_); + num_outstanding_batched_items_ -= batch->size(); + } + if (!has_process_batch_function_) { + ProcessBatch(std::move(batch)); + } else { + ProcessFuncBatch(std::move(batch), std::move(unbatched_tasks)); + } +} + // Looks up the batcher queue for 'queue_name'. If it didn't previously exist, // creates it. 
Status BatchResourceBase::LookupOrCreateBatcherQueue(const string& queue_name, @@ -1094,23 +1161,19 @@ Status BatchResourceBase::LookupOrCreateBatcherQueue(const string& queue_name, } std::unique_ptr new_queue; - auto process_batch_callback = [this](std::unique_ptr batch) { - if (!session_metadata().name().empty()) { - absl::MutexLock lock(&outstanding_batch_mu_); - num_outstanding_batched_items_ -= batch->size(); - } - if (!has_process_batch_function_) { - ProcessBatch(std::move(batch)); - } else { - ProcessFuncBatch(std::move(batch)); - } - }; if (batcher_) { - TF_RETURN_IF_ERROR(batcher_->AddQueue(batcher_queue_options_, - process_batch_callback, &new_queue)); + TF_RETURN_IF_ERROR(batcher_->AddQueue( + batcher_queue_options_, + absl::bind_front(&BatchResourceBase::ProcessBatchCallBack, this), + &new_queue)); } else if (adaptive_batcher_) { + std::function>)> + reduced_process_batch_callback = [this](std::unique_ptr batch) { + ProcessBatchCallBack(std::move(batch), {}); + }; TF_RETURN_IF_ERROR(adaptive_batcher_->AddQueue( - adaptive_batcher_queue_options_, process_batch_callback, &new_queue)); + adaptive_batcher_queue_options_, reduced_process_batch_callback, + &new_queue)); } else { return errors::Internal("No batcher defined."); } diff --git a/tensorflow/core/kernels/batching_util/batch_resource_base.h b/tensorflow/core/kernels/batching_util/batch_resource_base.h index 1bd122e1d1dc9e..60ecf980e95443 100644 --- a/tensorflow/core/kernels/batching_util/batch_resource_base.h +++ b/tensorflow/core/kernels/batching_util/batch_resource_base.h @@ -271,17 +271,37 @@ class BatchResourceBase : public ResourceBase { // returns 'batch_size'. int RoundToLowestAllowedBatchSize(int batch_size) const; - Status ConcatInputTensors(const BatchT& batch, OpKernelContext* context, - std::vector* concatenated_tensors) const; - - Status SplitOutputTensors(const std::vector& combined_outputs, - BatchT* batch) const; - - void ProcessFuncBatch(std::unique_ptr batch) const; + // Helper function to propagate the status to the task's context and call the + // done callback on the task. + void CleanUpFunctionHelper(BatchTask& task, const Status& status) const; + + // Concatenates the input tensors of the tasks from the batch and the + // unbatched task vector. When padding is enabled in the batcher queue, they + // are padded with garbage value up to the nearest allowed batch size. + Status ConcatInputTensors( + const BatchT& batch, + const std::vector>& unbatched_tasks, + OpKernelContext* context, + std::vector* concatenated_tensors) const; + + Status SplitOutputTensors( + const std::vector& combined_outputs, BatchT* batch, + std::vector>& unbatched_tasks) const; + + void ProcessFuncBatch( + std::unique_ptr batch, + std::vector> unbatched_tasks = {}) const; // Processes a batch of one or more BatchTask entries. void ProcessBatch(std::unique_ptr batch) const; + // Callback function that wraps the Process*Batch functions above. The caller + // of the callback must guarantee that the unique pointers passed as argument + // are not null. + void ProcessBatchCallBack( + std::unique_ptr> batch, + std::vector> unbatched_tasks); + // Emits an index tensor, which the Unbatch op will use to un-concatenate // the tensor and attribute the pieces to the right batch keys. 
The index // tensor contains, for each input: [batch_key, start_offset, end_offset] diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h index d1d8551c250bea..4b9a599a77c5ac 100644 --- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h +++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h @@ -49,6 +49,7 @@ limitations under the License. #include "tensorflow/core/profiler/lib/context_types.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/profiler/lib/traceme_encode.h" +#include "tsl/platform/criticality.h" #include "tsl/platform/errors.h" namespace tensorflow { @@ -436,6 +437,15 @@ class Queue { // Same as IsEmpty(), but assumes the caller already holds a lock on 'mu_'. bool IsEmptyInternal() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Returns true iff the task is a low priority task based on the queue option. + bool IsLowPriorityTask(std::unique_ptr* task); + + // Implementation of ScheduleWithoutOrEagerSplit above. Enqueues `task` as it + // is or split it inline (eagerly) to form batches to be processed by + // `Queue::ProcessBatch` + Status ScheduleWithoutOrEagerSplitImpl(std::unique_ptr* task) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Closes the open batch residing at the back of std::deque, and inserts a // fresh open batch behind it. void StartNewBatch() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); @@ -949,6 +959,72 @@ Status Queue::ScheduleWithLazySplit(std::unique_ptr* task) { return absl::OkStatus(); } +template +bool Queue::IsLowPriorityTask(std::unique_ptr* task) { + if (!options_.enable_priority_queue) { + return false; + } + + // The criticality is defined only when the task is a derived class of + // BatchTask. + if constexpr (std::is_base_of_v) { + // TODO(b/316379576): Make the criticality and priority configurable. + return ((*task)->criticality() == + tsl::criticality::Criticality::kSheddablePlus || + (*task)->criticality() == + tsl::criticality::Criticality::kSheddable); + } + + // Otherwise, consider it a high priority task and return false. + return false; +} + +template +Status Queue::ScheduleWithoutOrEagerSplitImpl( + std::unique_ptr* task) { + // TODO(b/161857471): + // Add test coverage when when concurrent incoming batches arrives and + // use up all queue capacity. + TF_RETURN_IF_ERROR(ValidateBatchTaskQueueCapacity((*task).get())); + + std::deque>>& batches = GetBatches(); + + const int64_t open_batch_remaining_slot = + max_execution_batch_size() - batches.back()->size(); + + const int64_t input_task_size = (*task)->size(); + + std::vector> output_tasks; + + if (input_task_size <= open_batch_remaining_slot || + !options_.enable_large_batch_splitting) { + // This is the fast path when input doesn't need to be split. 
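Editorial note on the padding arithmetic described for ConcatInputTensors above (a sketch with assumed numbers and a stubbed-out rounding helper, not part of the patch): the open batch and the unbatched low priority tasks are summed first, then rounded up to the nearest allowed batch size, and only the remainder is filled with padding.

#include <vector>

// Stand-in for BatchResourceBase::RoundToLowestAllowedBatchSize: returns the
// smallest allowed batch size that fits, or the size itself if none does.
// Assumes 'allowed' is sorted ascending, as the real allowed_batch_sizes is.
int RoundUpToAllowedBatchSize(int size, const std::vector<int>& allowed) {
  for (int allowed_size : allowed) {
    if (allowed_size >= size) return allowed_size;
  }
  return size;
}

// Mirrors the computation in ConcatInputTensors: e.g. a high priority batch
// of size 3 plus unbatched low priority tasks totalling 2, with allowed
// sizes {4, 8}, is padded up to 8, so 3 rows of padding are appended after
// the real inputs.
int ComputePaddingAmount(int batch_size, int unbatched_tasks_size,
                         const std::vector<int>& allowed) {
  const int padded =
      RoundUpToAllowedBatchSize(batch_size + unbatched_tasks_size, allowed);
  return padded - batch_size - unbatched_tasks_size;
}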
+ output_tasks.push_back(std::move(*task)); + } else { + TF_RETURN_IF_ERROR(SplitInputBatchIntoSubtasks(task, &output_tasks)); + } + + for (int i = 0; i < output_tasks.size(); ++i) { + if (batches.back()->size() + output_tasks[i]->size() > + max_execution_batch_size()) { + StartNewBatch(); + } + if (batches.back()->empty()) { + open_batch_start_time_micros_ = env_->NowMicros(); + } + profiler::TraceMeProducer trace_me( + [&output_tasks, i] { + return profiler::TraceMeEncode("ScheduleOutputTask", + {{"size", output_tasks[i]->size()}}); + }, + profiler::ContextType::kSharedBatchScheduler, + batches.back()->traceme_context_id()); + batches.back()->AddTask(std::move(output_tasks[i])); + } + + return absl::OkStatus(); +} + // TODO(b/194294263): // Merge `ScheduleWithoutOrEagerSplit` and `ScheduleWithLazySplit` into // `Schedule`. @@ -969,48 +1045,18 @@ Status Queue::ScheduleWithoutOrEagerSplit( DCHECK(!closed_); - // TODO(b/161857471): - // Add test coverage when when concurrent incoming batches arrives and - // use up all queue capacity. - TF_RETURN_IF_ERROR(ValidateBatchTaskQueueCapacity((*task).get())); - - std::deque>>& batches = GetBatches(); - - const int64_t open_batch_remaining_slot = - max_execution_batch_size() - batches.back()->size(); - - const int64_t input_task_size = (*task)->size(); - - std::vector> output_tasks; - - if (input_task_size <= open_batch_remaining_slot || - !large_batch_splitting) { - // This is the fast path when input doesn't need to be split. - output_tasks.push_back(std::move(*task)); + if (IsLowPriorityTask(task)) { + // Insert the task to the low priority task queue instead of the high + // priority batch queue below. + low_priority_tasks_.AddTask(std::move(*task)); } else { - TF_RETURN_IF_ERROR(SplitInputBatchIntoSubtasks(task, &output_tasks)); - } - - for (int i = 0; i < output_tasks.size(); ++i) { - if (batches.back()->size() + output_tasks[i]->size() > - max_execution_batch_size()) { - StartNewBatch(); - } - if (batches.back()->empty()) { - open_batch_start_time_micros_ = env_->NowMicros(); - } - profiler::TraceMeProducer trace_me( - [&output_tasks, i] { - return profiler::TraceMeEncode("ScheduleOutputTask", - {{"size", output_tasks[i]->size()}}); - }, - profiler::ContextType::kSharedBatchScheduler, - batches.back()->traceme_context_id()); - batches.back()->AddTask(std::move(output_tasks[i])); + TF_RETURN_IF_ERROR(ScheduleWithoutOrEagerSplitImpl(task)); } + // Check if the batch queue has a schedulable batch and mark it schedulable + // if it not already marked. if (!schedulable_batch_) { - if (batches.size() > 1 || IsOpenBatchSchedulable()) { + if (GetBatches().size() > 1 || IsOpenBatchSchedulable()) { schedulable_batch_ = true; notify_of_schedulable_batch = true; } diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc index a703028a5e6234..29b79b3bb4b712 100644 --- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc +++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h" +#include #include #include #include // NOLINT(build/c++11) @@ -37,6 +38,7 @@ limitations under the License. 
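For context on the routing added in the scheduler above (an editorial sketch, not part of the patch; the free function is hypothetical): only requests whose calling context reports kSheddable or kSheddablePlus criticality are diverted to the low priority task queue, and callers opt in with tsl::criticality::ScopedCriticality exactly as the tests below do.

#include "tsl/platform/criticality.h"

// Mirrors Queue<TaskType>::IsLowPriorityTask: tasks that do not expose a
// criticality at all, or that arrive while the priority queue option is
// disabled, stay on the regular high priority batching path.
bool RoutesToLowPriorityQueue(tsl::criticality::Criticality criticality,
                              bool enable_priority_queue) {
  if (!enable_priority_queue) return false;
  return criticality == tsl::criticality::Criticality::kSheddable ||
         criticality == tsl::criticality::Criticality::kSheddablePlus;
}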
#include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tsl/platform/criticality.h" namespace tensorflow { namespace serving { @@ -46,19 +48,43 @@ using ::testing::HasSubstr; class FakeTask : public BatchTask { public: - explicit FakeTask(size_t size) : size_(size) {} + explicit FakeTask(size_t size, tsl::criticality::Criticality criticality = + tsl::criticality::Criticality::kCritical) + : size_(size), criticality_(criticality) {} ~FakeTask() override = default; size_t size() const override { return size_; } + tsl::criticality::Criticality criticality() const override { + return criticality_; + } + private: const size_t size_; + const tsl::criticality::Criticality criticality_; FakeTask(const FakeTask&) = delete; void operator=(const FakeTask&) = delete; }; +// Fake task taht doesn't inherit BatchTask and doesn't define criticality. The +// shared batch scheduler should still work with this task. +class FakeTaskWithoutCriticality { + public: + explicit FakeTaskWithoutCriticality(size_t size) : size_(size) {} + + ~FakeTaskWithoutCriticality() = default; + + size_t size() const { return size_; } + + private: + const size_t size_; + + FakeTaskWithoutCriticality(const FakeTaskWithoutCriticality&) = delete; + void operator=(const FakeTaskWithoutCriticality&) = delete; +}; + using Queue = BatchScheduler; using Scheduler = SharedBatchScheduler; using QueueOptions = Scheduler::QueueOptions; @@ -67,10 +93,26 @@ using SplitFunc = int first_output_task_size, int input_batch_size_limit, std::vector>* output_tasks)>; -// Creates a FakeTask of size 'task_size', and calls 'scheduler->Schedule()' on -// that task. Returns the resulting status. -Status ScheduleTask(size_t task_size, BatchScheduler* scheduler) { - std::unique_ptr task(new FakeTask(task_size)); +// Creates a FakeTask of size 'task_size' and 'criticality', and calls +// 'scheduler->Schedule()' on that task. Returns the resulting status. +// 'criticality' defaults to kCritical. +Status ScheduleTask(size_t task_size, BatchScheduler* scheduler, + tsl::criticality::Criticality criticality = + tsl::criticality::Criticality::kCritical) { + std::unique_ptr task(new FakeTask(task_size, criticality)); + Status status = scheduler->Schedule(&task); + // Schedule() should have consumed 'task' iff it returned Status::OK. + CHECK_EQ(status.ok(), task == nullptr); + return status; +} + +// Helper function similar to the function above. Creates a FakeTask of size +// 'task_size' and calls 'scheduler->Schedule()' on that task. Returns the +// resulting status. +Status ScheduleTaskWithoutCriticality( + size_t task_size, BatchScheduler* scheduler) { + std::unique_ptr task( + new FakeTaskWithoutCriticality(task_size)); Status status = scheduler->Schedule(&task); // Schedule() should have consumed 'task' iff it returned Status::OK. CHECK_EQ(status.ok(), task == nullptr); @@ -349,6 +391,101 @@ TEST_P(SharedBatchSchedulerTest, EXPECT_TRUE(queue_1_callback_called); } +// The task in the shared batch scheduler template parameter does not define +// criticality priority queue. It should work as if the priority queue is +// disabled. 
+TEST_P( + SharedBatchSchedulerTest, + CallbackWithTaskVectorOkWithPriorityQueueEnabledWithCriticalitylessTask) { + bool queue_0_callback_called = false; + auto queue_0_callback = + [&queue_0_callback_called]( + std::unique_ptr> batch, + std::vector> tasks) { + queue_0_callback_called = true; + ASSERT_TRUE(batch->IsClosed()); + ASSERT_EQ(3, batch->num_tasks()); + EXPECT_EQ(1, batch->task(0).size()); + EXPECT_EQ(3, batch->task(1).size()); + EXPECT_EQ(5, batch->task(2).size()); + EXPECT_EQ(0, tasks.size()); + }; + bool queue_1_callback_called = false; + auto queue_1_callback = + [&queue_1_callback_called]( + std::unique_ptr> batch, + std::vector> tasks) { + queue_1_callback_called = true; + ASSERT_TRUE(batch->IsClosed()); + ASSERT_EQ(2, batch->num_tasks()); + EXPECT_EQ(2, batch->task(0).size()); + EXPECT_EQ(4, batch->task(1).size()); + EXPECT_EQ(0, tasks.size()); + }; + { + SharedBatchScheduler::Options options; + options.num_batch_threads = 3; + options.env = Env::Default(); + + std::shared_ptr> + shared_batch_scheduler; + TF_CHECK_OK(SharedBatchScheduler::Create( + options, &shared_batch_scheduler)); + + // Create two queues. + + const SharedBatchScheduler::QueueOptions + queue_options = { + .input_batch_size_limit = 10, + .batch_timeout_micros = 1000 * 1000, + .max_enqueued_batches = 2, + .enable_large_batch_splitting = enable_input_batch_split(), + .split_input_task_func = + [](std::unique_ptr* input_task, + int open_batch_remaining_slot, int max_batch_size, + std::vector>* + output_tasks) -> Status { + std::unique_ptr owned_input_task = + std::move(*input_task); + const int input_task_size = owned_input_task->size(); + + const internal::InputSplitMetadata input_split_metadata( + input_task_size, open_batch_remaining_slot, max_batch_size); + + const absl::FixedArray task_sizes = + input_split_metadata.task_sizes(); + const int num_batches = task_sizes.size(); + + output_tasks->resize(num_batches); + for (int i = 0; i < num_batches; i++) { + (*output_tasks)[i] = + std::make_unique(task_sizes[i]); + } + + return absl::OkStatus(); + }, + .enable_lazy_split = enable_lazy_split(), + .max_execution_batch_size = 10, + .enable_priority_queue = true}; + + std::unique_ptr> queue_0; + TF_CHECK_OK(shared_batch_scheduler->AddQueue(queue_options, + queue_0_callback, &queue_0)); + std::unique_ptr> queue_1; + TF_CHECK_OK(shared_batch_scheduler->AddQueue(queue_options, + queue_1_callback, &queue_1)); + + // Submit tasks to the two queues. + TF_ASSERT_OK(ScheduleTaskWithoutCriticality(1, queue_0.get())); + TF_ASSERT_OK(ScheduleTaskWithoutCriticality(2, queue_1.get())); + TF_ASSERT_OK(ScheduleTaskWithoutCriticality(3, queue_0.get())); + TF_ASSERT_OK(ScheduleTaskWithoutCriticality(4, queue_1.get())); + TF_ASSERT_OK(ScheduleTaskWithoutCriticality(5, queue_0.get())); + } + EXPECT_TRUE(queue_0_callback_called); + EXPECT_TRUE(queue_1_callback_called); +} + TEST_P(SharedBatchSchedulerTest, ObeyBatchSizeConstraint) { // Set up a fake clock, which only advances when we explicitly tell it to. 
test_util::FakeClockEnv env(Env::Default()); @@ -912,6 +1049,93 @@ INSTANTIATE_TEST_SUITE_P( std::make_tuple(/*enable_input_batch_split=*/false, /*enable_lazy_split=*/false))); +using SharedBatchSchedulerPriorityTest = SharedBatchSchedulerTest; + +TEST_P(SharedBatchSchedulerPriorityTest, + CallbackWithTaskVectorOkWithPriorityQueueEnabledWithPrioritySet) { + bool queue_callback_called = false; + auto queue_callback = [&queue_callback_called]( + std::unique_ptr> batch, + std::vector> tasks) { + queue_callback_called = true; + ASSERT_TRUE(batch->IsClosed()); + ASSERT_EQ(2, batch->num_tasks()); + EXPECT_EQ(1, batch->task(0).size()); + EXPECT_EQ(3, batch->task(1).size()); + EXPECT_EQ(1, tasks.size()); + EXPECT_EQ(5, tasks[0]->size()); + }; + + { + std::shared_ptr scheduler = + CreateSharedBatchScheduler(/*num_batch_threads=*/3); + + // Create two queues. + const QueueOptions queue_options = CreateQueueOptions( + /*max_execution_batch_size=*/10, /*input_batch_size_limit=*/10, + /*batch_timeout_micros=*/1 * 1000 * 1000, /*max_enqueued_batches=*/2, + /*enable_priority_queue=*/true); + std::unique_ptr queue = + CreateQueue(scheduler, queue_options, queue_callback); + + // Submit tasks to the two queues. + TF_ASSERT_OK(ScheduleTask(1, queue.get(), + tsl::criticality::Criticality::kCriticalPlus)); + TF_ASSERT_OK(ScheduleTask(3, queue.get(), + tsl::criticality::Criticality::kCriticalPlus)); + TF_ASSERT_OK(ScheduleTask(5, queue.get(), + tsl::criticality::Criticality::kSheddable)); + } + EXPECT_TRUE(queue_callback_called); +} + +TEST_P(SharedBatchSchedulerPriorityTest, + CallbackWithTaskVectorOkWithPriorityQueueDisabledWithPrioritySet) { + bool queue_callback_called = false; + auto queue_callback = [&queue_callback_called]( + std::unique_ptr> batch, + std::vector> tasks) { + queue_callback_called = true; + ASSERT_TRUE(batch->IsClosed()); + ASSERT_EQ(3, batch->num_tasks()); + EXPECT_EQ(1, batch->task(0).size()); + EXPECT_EQ(3, batch->task(1).size()); + EXPECT_EQ(5, batch->task(2).size()); + EXPECT_EQ(0, tasks.size()); + }; + + { + std::shared_ptr scheduler = + CreateSharedBatchScheduler(/*num_batch_threads=*/3); + + // Create two queues. + const QueueOptions queue_options = CreateQueueOptions( + /*max_execution_batch_size=*/10, /*input_batch_size_limit=*/10, + /*batch_timeout_micros=*/1 * 1000 * 1000, /*max_enqueued_batches=*/2, + /*enable_priority_queue=*/false); + std::unique_ptr queue = + CreateQueue(scheduler, queue_options, queue_callback); + + // Submit tasks to the two queues. + TF_ASSERT_OK(ScheduleTask(1, queue.get(), + tsl::criticality::Criticality::kCriticalPlus)); + TF_ASSERT_OK(ScheduleTask(3, queue.get(), + tsl::criticality::Criticality::kCriticalPlus)); + TF_ASSERT_OK(ScheduleTask(5, queue.get(), + tsl::criticality::Criticality::kSheddable)); + } + EXPECT_TRUE(queue_callback_called); +} + +// Lazy split is to be removed. The mixed priority batching is only supported +// when the lazy split is not enabled. 
+INSTANTIATE_TEST_SUITE_P( + Parameter, SharedBatchSchedulerPriorityTest, + ::testing::Values(std::make_tuple(/*enable_input_batch_split=*/true, + /*enable_lazy_split=*/false), + std::make_tuple(/*enable_input_batch_split=*/false, + /*enable_lazy_split=*/false))); + #ifdef PLATFORM_GOOGLE // This benchmark relies on https://github.com/google/benchmark features, // (in particular, `Benchmark::ThreadRange`) not available in open-sourced TF From a6e55e74bba04b80bab202a53265182d06307aa1 Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Mon, 18 Mar 2024 09:24:37 -0700 Subject: [PATCH 027/670] [XLA:GPU] Add IndexingContext to store MLIRContext* and RTVars registry. PiperOrigin-RevId: 616850787 --- third_party/xla/xla/service/gpu/BUILD | 1 + third_party/xla/xla/service/gpu/fusions/BUILD | 3 + .../xla/service/gpu/fusions/concatenate.cc | 2 +- .../xla/xla/service/gpu/fusions/concatenate.h | 4 +- .../service/gpu/fusions/concatenate_mlir.cc | 17 +- .../service/gpu/fusions/concatenate_mlir.h | 5 +- .../xla/service/gpu/fusions/fusion_emitter.cc | 19 +- .../xla/service/gpu/fusions/fusion_emitter.h | 6 +- .../fusions/in_place_dynamic_update_slice.h | 4 +- .../xla/service/gpu/fusions/input_slices.cc | 4 +- .../xla/service/gpu/fusions/input_slices.h | 4 +- .../service/gpu/fusions/input_slices_mlir.cc | 8 +- .../service/gpu/fusions/input_slices_mlir.h | 4 +- .../service/gpu/fusions/input_slices_test.cc | 5 +- .../xla/xla/service/gpu/fusions/loop.cc | 13 +- .../xla/xla/service/gpu/fusions/loop.h | 4 +- .../xla/xla/service/gpu/fusions/loop_mlir.cc | 17 +- .../xla/xla/service/gpu/fusions/loop_mlir.h | 4 +- .../xla/service/gpu/fusions/loop_mlir_test.cc | 16 +- .../xla/xla/service/gpu/fusions/loop_test.cc | 12 +- .../gpu/fusions/mlir/elemental_hlo_to_mlir.cc | 11 +- .../gpu/fusions/mlir/mlir_fusion_emitter.h | 1 + .../fusions/mlir/mlir_fusion_emitter_test.cc | 4 +- .../gpu/fusions/mlir/simplify_affine.cc | 5 +- .../gpu/fusions/mlir_emitter_test_base.cc | 3 +- .../gpu/fusions/mlir_emitter_test_base.h | 2 + .../xla/service/gpu/fusions/reduction_base.cc | 39 +-- .../xla/service/gpu/fusions/reduction_base.h | 13 +- .../gpu/fusions/reduction_base_test.cc | 51 ++-- .../xla/service/gpu/fusions/reduction_mlir.cc | 12 +- .../xla/xla/service/gpu/fusions/scatter.h | 4 +- .../xla/service/gpu/fusions/scatter_mlir.cc | 17 +- .../xla/service/gpu/fusions/scatter_mlir.h | 4 +- .../service/gpu/fusions/scatter_mlir_test.cc | 12 +- .../xla/xla/service/gpu/fusions/transpose.cc | 20 +- .../xla/xla/service/gpu/fusions/transpose.h | 4 +- .../xla/service/gpu/fusions/transpose_mlir.cc | 39 +-- .../xla/service/gpu/fusions/transpose_mlir.h | 14 +- .../gpu/fusions/transpose_mlir_test.cc | 16 +- .../xla/service/gpu/fusions/transpose_test.cc | 41 +-- .../xla/xla/service/gpu/ir_emitter_context.h | 4 + third_party/xla/xla/service/gpu/model/BUILD | 13 +- .../service/gpu/model/coalescing_analysis.cc | 29 ++- .../service/gpu/model/coalescing_analysis.h | 7 +- .../gpu/model/coalescing_analysis_test.cc | 6 +- .../model/gpu_indexing_performance_model.cc | 2 +- .../model/gpu_indexing_performance_model.h | 5 +- .../service/gpu/model/indexing_analysis.cc | 236 +++++++++++------- .../xla/service/gpu/model/indexing_analysis.h | 37 +-- .../gpu/model/indexing_analysis_test.cc | 14 +- .../xla/service/gpu/model/indexing_context.cc | 27 ++ .../xla/service/gpu/model/indexing_context.h | 54 ++++ .../xla/xla/service/gpu/model/indexing_map.cc | 21 +- .../xla/xla/service/gpu/model/indexing_map.h | 24 +- .../service/gpu/model/indexing_map_test.cc | 79 ++++-- 
.../service/gpu/model/indexing_test_utils.cc | 12 +- .../service/gpu/model/indexing_test_utils.h | 4 + .../xla/service/gpu/model/tile_analysis.cc | 8 +- 58 files changed, 665 insertions(+), 381 deletions(-) create mode 100644 third_party/xla/xla/service/gpu/model/indexing_context.cc create mode 100644 third_party/xla/xla/service/gpu/model/indexing_context.h diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD index 59d7cc553ec9cc..508fd8f638e8a5 100644 --- a/third_party/xla/xla/service/gpu/BUILD +++ b/third_party/xla/xla/service/gpu/BUILD @@ -300,6 +300,7 @@ cc_library( "//xla/hlo/ir:hlo", "//xla/service:buffer_assignment", "//xla/service:name_uniquer", + "//xla/service/gpu/model:indexing_map", "//xla/stream_executor:device_description", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", diff --git a/third_party/xla/xla/service/gpu/fusions/BUILD b/third_party/xla/xla/service/gpu/fusions/BUILD index 94e10f18bc4d55..fefac9f2e75fa0 100644 --- a/third_party/xla/xla/service/gpu/fusions/BUILD +++ b/third_party/xla/xla/service/gpu/fusions/BUILD @@ -216,6 +216,7 @@ cc_library( "//xla/service/gpu/fusions/mlir:mlir_fusion_emitter", "//xla/service/gpu/fusions/mlir/ir:xla_gpu", "//xla/service/gpu/model:affine_map_printer", + "//xla/service/gpu/model:indexing_map", "//xla/stream_executor:device_description", "//xla/tests:filecheck", "//xla/tests:hlo_test_base", @@ -705,6 +706,7 @@ xla_cc_test( "//xla/service/gpu:gpu_device_info_for_tests", "//xla/service/gpu:hlo_fusion_analysis", "//xla/service/gpu:ir_emitter_context", + "//xla/service/gpu/model:indexing_map", "//xla/service/gpu/model:indexing_test_utils", "//xla/stream_executor:device_description", "//xla/tests:hlo_test_base", @@ -981,6 +983,7 @@ xla_cc_test( "//xla/service/gpu:gpu_device_info_for_tests", "//xla/service/gpu:hlo_fusion_analysis", "//xla/service/gpu/model:affine_map_printer", + "//xla/service/gpu/model:indexing_map", "//xla/service/gpu/model:indexing_test_utils", "//xla/stream_executor:device_description", "//xla/tests:hlo_test_base", diff --git a/third_party/xla/xla/service/gpu/fusions/concatenate.cc b/third_party/xla/xla/service/gpu/fusions/concatenate.cc index 084aece24b1c92..b8acbd4f8072d9 100644 --- a/third_party/xla/xla/service/gpu/fusions/concatenate.cc +++ b/third_party/xla/xla/service/gpu/fusions/concatenate.cc @@ -58,7 +58,7 @@ ConcatenateFusion::ConcatenateFusion(const HloFusionAnalysis& analysis) : analysis_(analysis) {} std::optional ConcatenateFusion::ComputeThreadIdToOutputIndexing( - int64_t output_id, mlir::MLIRContext* ctx) const { + int64_t output_id, IndexingContext* indexing_context) const { return std::nullopt; // TODO(b/319081342): Implement this. 
} diff --git a/third_party/xla/xla/service/gpu/fusions/concatenate.h b/third_party/xla/xla/service/gpu/fusions/concatenate.h index 997033293eff2b..5e51b50c2d1408 100644 --- a/third_party/xla/xla/service/gpu/fusions/concatenate.h +++ b/third_party/xla/xla/service/gpu/fusions/concatenate.h @@ -38,11 +38,11 @@ class ConcatenateFusion : public KernelFusionEmitterBase { LaunchDimensions launch_dimensions() const override; std::optional ComputeThreadIdToOutputIndexing( - int64_t output_id, mlir::MLIRContext* ctx) const override; + int64_t output_id, IndexingContext* indexing_context) const override; std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const override { + IndexingContext* indexing_context) const override { // TODO(b/319081342): Implement this. return std::nullopt; } diff --git a/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.cc b/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.cc index 638c8ec9436c25..974365eca8efa3 100644 --- a/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.cc +++ b/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.cc @@ -66,17 +66,17 @@ LaunchDimensions MlirConcatenateFusion::launch_dimensions() const { std::optional MlirConcatenateFusion::ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const { + int64_t root_index, IndexingContext* indexing_context) const { return std::nullopt; } std::optional MlirConcatenateFusion::ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const { + IndexingContext* indexing_context) const { return GetDefaultThreadIdToOutputIndexingMap( launch_dimensions(), /*unroll_factor=*/1, - GetLargestConcatOperandShape(analysis_), ctx); + GetLargestConcatOperandShape(analysis_), indexing_context); } std::vector @@ -96,7 +96,8 @@ absl::Status MlirConcatenateFusion::EmitEntryFunction( const auto* concat = analysis_.fusion_heroes()[0]; mlir::ImplicitLocOpBuilder builder(entry_function.getLoc(), entry_function); builder.setInsertionPointToStart(entry_function.addEntryBlock()); - auto* ctx = entry_function.getContext(); + auto* mlir_context = entry_function.getContext(); + IndexingContext indexing_context{mlir_context}; int num_inputs = fusion.fused_instructions_computation()->num_parameters(); SmallVector input_tensors( @@ -109,13 +110,15 @@ absl::Status MlirConcatenateFusion::EmitEntryFunction( auto thread_id_to_input_map = ComputeThreadIdToInputIndexing( - /*root_index=*/0, /*hero_operand_index=*/0, ctx) + /*root_index=*/0, /*hero_operand_index=*/0, &indexing_context) .value(); - auto epilogue_indexing = ComputeEpilogueInputToOutputIndexing(concat, ctx); + auto epilogue_indexing = + ComputeEpilogueInputToOutputIndexing(concat, &indexing_context); for (auto [operand_index, operand] : llvm::enumerate(concat->operands())) { auto input_to_output_map = - *ComputeInputToOutputIndexing(concat, /*input_id=*/operand_index, ctx) + *ComputeInputToOutputIndexing(concat, /*input_id=*/operand_index, + &indexing_context) .indexing_maps.front() .begin(); auto thread_id_to_output_map = ComposeIndexingMaps( diff --git a/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.h b/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.h index 5003046bf39e41..f07a637d16c956 100644 --- a/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.h +++ b/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.h @@ -27,6 +27,7 @@ limitations under the License. 
#include "xla/service/gpu/fusions/mlir/mlir_fusion_emitter.h" #include "xla/service/gpu/hlo_fusion_analysis.h" #include "xla/service/gpu/launch_dimensions.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/service/gpu/model/indexing_map.h" namespace xla { @@ -42,11 +43,11 @@ class MlirConcatenateFusion : public MlirFusionEmitterBase { LaunchDimensions launch_dimensions() const override; std::optional ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const override; + int64_t root_index, IndexingContext* indexing_context) const override; std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const override; + IndexingContext* indexing_context) const override; protected: absl::Status EmitEntryFunction( diff --git a/third_party/xla/xla/service/gpu/fusions/fusion_emitter.cc b/third_party/xla/xla/service/gpu/fusions/fusion_emitter.cc index d4d3a33ce57e78..e18557c012df62 100644 --- a/third_party/xla/xla/service/gpu/fusions/fusion_emitter.cc +++ b/third_party/xla/xla/service/gpu/fusions/fusion_emitter.cc @@ -119,8 +119,9 @@ absl::Status AnnotateKernelLaunchDimensions( IndexingMap KernelFusionInterface::GetDefaultThreadIdToOutputIndexingMap( const LaunchDimensions& launch_dims, int unroll_factor, - const Shape& output_shape, mlir::MLIRContext* ctx) { + const Shape& output_shape, IndexingContext* indexing_context) { std::vector output_dims(output_shape.rank()); + auto mlir_context = indexing_context->GetMLIRContext(); std::array thread_counts{ launch_dims.thread_counts_per_block().x, @@ -143,19 +144,20 @@ IndexingMap KernelFusionInterface::GetDefaultThreadIdToOutputIndexingMap( // This means that this code supports some launch grids that the parallel // loop emitter doesn't support. This is safe, since the latter CHECK fails // if its assumptions are not fulfilled. - mlir::AffineExpr c0 = mlir::getAffineConstantExpr(0, ctx); + mlir::AffineExpr c0 = mlir::getAffineConstantExpr(0, mlir_context); mlir::AffineExpr linear_index = c0; uint64_t stride = 1; for (int i = 0; i < 3; ++i) { - auto coord = mlir::getAffineDimExpr(kIndexingMapThreadIdxDims[i], ctx) + - mlir::getAffineDimExpr(kIndexingMapBlockIdxDims[i], ctx) * - thread_counts[i]; + auto coord = + mlir::getAffineDimExpr(kIndexingMapThreadIdxDims[i], mlir_context) + + mlir::getAffineDimExpr(kIndexingMapBlockIdxDims[i], mlir_context) * + thread_counts[i]; auto linear_component = coord * stride; linear_index = linear_index + linear_component; stride *= total_sizes[i]; } - mlir::AffineExpr chunk_id = mlir::getAffineSymbolExpr(0, ctx); - mlir::AffineExpr unroll_elem_id = mlir::getAffineSymbolExpr(1, ctx); + mlir::AffineExpr chunk_id = mlir::getAffineSymbolExpr(0, mlir_context); + mlir::AffineExpr unroll_elem_id = mlir::getAffineSymbolExpr(1, mlir_context); linear_index = linear_index * unroll_factor + chunk_id * unroll_factor * launch_dims.launch_bound() + @@ -187,8 +189,9 @@ IndexingMap KernelFusionInterface::GetDefaultThreadIdToOutputIndexingMap( 1}); symbol_ranges.push_back({0, unroll_factor - 1}); IndexingMap indexing_map( + indexing_context, mlir::AffineMap::get(/*dimCount=*/6, - /*symbolCount=*/2, output_dims, ctx), + /*symbolCount=*/2, output_dims, mlir_context), dimension_ranges, symbol_ranges); // Remove the unroll_elem_id symbol if unrolling divides num_elements. 
if (num_elements % unroll_factor == 0) { diff --git a/third_party/xla/xla/service/gpu/fusions/fusion_emitter.h b/third_party/xla/xla/service/gpu/fusions/fusion_emitter.h index dbc8e8718debe0..b5fa0f32152e32 100644 --- a/third_party/xla/xla/service/gpu/fusions/fusion_emitter.h +++ b/third_party/xla/xla/service/gpu/fusions/fusion_emitter.h @@ -77,14 +77,14 @@ class KernelFusionInterface : public FusionInterface { // unsupported (scatter, in-place DUS). Implementations will return nullopt. // Note: Work in progress, not implemented for all emitters. virtual std::optional ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const = 0; + int64_t root_index, IndexingContext* indexing_context) const = 0; // Computes an indexing map from thread to input element(s) of the root's // **hero**. Note that in many cases this is not computable from the output // indexing. The indexing may only be known for some operands of the hero. virtual std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const = 0; + IndexingContext* indexing_context) const = 0; static constexpr std::array kIndexingMapThreadIdxDims = {0, 1, 2}; static constexpr std::array kIndexingMapBlockIdxDims = {3, 4, 5}; @@ -96,7 +96,7 @@ class KernelFusionInterface : public FusionInterface { // block sizes in the given launch dimensions. static IndexingMap GetDefaultThreadIdToOutputIndexingMap( const LaunchDimensions& launch_dims, int unroll_factor, - const Shape& output_shape, mlir::MLIRContext* ctx); + const Shape& output_shape, IndexingContext* indexing_context); }; // Base class for fusions that are implemented using a single kernel, which is diff --git a/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice.h b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice.h index 12be8043b05ec1..4e4f2d82e94a80 100644 --- a/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice.h +++ b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice.h @@ -67,7 +67,7 @@ class InPlaceDynamicUpdateSliceFusion : public KernelFusionEmitterBase { LaunchDimensions launch_dimensions() const override; std::optional ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const override { + int64_t root_index, IndexingContext* indexing_context) const override { // The mapping cannot be statically computed in general, since the offsets // are unknown. return std::nullopt; @@ -75,7 +75,7 @@ class InPlaceDynamicUpdateSliceFusion : public KernelFusionEmitterBase { std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const override { + IndexingContext* indexing_context) const override { // TODO(b/319081342): Implement this. 
return std::nullopt; } diff --git a/third_party/xla/xla/service/gpu/fusions/input_slices.cc b/third_party/xla/xla/service/gpu/fusions/input_slices.cc index 85f661a8f125f5..aa1398639bd397 100644 --- a/third_party/xla/xla/service/gpu/fusions/input_slices.cc +++ b/third_party/xla/xla/service/gpu/fusions/input_slices.cc @@ -183,7 +183,7 @@ LaunchDimensions InputSlicesFusion::launch_dimensions() const { } std::optional InputSlicesFusion::ComputeThreadIdToOutputIndexing( - int64_t output_id, mlir::MLIRContext* ctx) const { + int64_t output_id, IndexingContext* indexing_context) const { // The mapping here is trivial and the same for all outputs - slice offsets // are applied in the indexing from slice outputs to slice inputs. auto launch_dims = launch_dimensions(); @@ -191,7 +191,7 @@ std::optional InputSlicesFusion::ComputeThreadIdToOutputIndexing( // still use the requested output's shape for clarity. const auto& shape = analysis_.fusion_roots()[output_id]->shape(); return GetDefaultThreadIdToOutputIndexingMap(launch_dims, unroll_factor_, - shape, ctx); + shape, indexing_context); } absl::Status InputSlicesFusion::EmitKernel( diff --git a/third_party/xla/xla/service/gpu/fusions/input_slices.h b/third_party/xla/xla/service/gpu/fusions/input_slices.h index 90f4f4e4a24d03..b1164c5df28e45 100644 --- a/third_party/xla/xla/service/gpu/fusions/input_slices.h +++ b/third_party/xla/xla/service/gpu/fusions/input_slices.h @@ -48,11 +48,11 @@ class InputSlicesFusion : public KernelFusionEmitterBase { LaunchDimensions launch_dimensions() const override; std::optional ComputeThreadIdToOutputIndexing( - int64_t output_id, mlir::MLIRContext* ctx) const override; + int64_t output_id, IndexingContext* indexing_context) const override; std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const override { + IndexingContext* indexing_context) const override { // TODO(b/319081342): Implement this. return std::nullopt; } diff --git a/third_party/xla/xla/service/gpu/fusions/input_slices_mlir.cc b/third_party/xla/xla/service/gpu/fusions/input_slices_mlir.cc index c1108ca37e8cd3..a10babd539b2e7 100644 --- a/third_party/xla/xla/service/gpu/fusions/input_slices_mlir.cc +++ b/third_party/xla/xla/service/gpu/fusions/input_slices_mlir.cc @@ -52,7 +52,7 @@ using mlir::ValueRange; std::optional MlirInputSlicesFusion::ComputeThreadIdToOutputIndexing( - int64_t output_id, mlir::MLIRContext* ctx) const { + int64_t output_id, IndexingContext* indexing_context) const { // The mapping here is trivial and the same for all outputs - slice offsets // are applied in the indexing from slice outputs to slice inputs. auto launch_dims = launch_dimensions(); @@ -60,7 +60,7 @@ MlirInputSlicesFusion::ComputeThreadIdToOutputIndexing( // still use the requested output's shape for clarity. const auto& shape = analysis_.fusion_roots()[output_id]->shape(); return GetDefaultThreadIdToOutputIndexingMap(launch_dims, unroll_factor_, - shape, ctx); + shape, indexing_context); } LaunchDimensions MlirInputSlicesFusion::launch_dimensions() const { @@ -80,8 +80,8 @@ absl::Status MlirInputSlicesFusion::EmitEntryFunction( // We enforce that all the root shapes have identical dimensions in // IsHloOpSupported. 
- auto indexing = - ComputeThreadIdToOutputIndexing(0, entry_function.getContext()); + IndexingContext indexing_context{entry_function.getContext()}; + auto indexing = ComputeThreadIdToOutputIndexing(0, &indexing_context); TF_RET_CHECK(indexing) << "Indexing is never nullopt"; int num_inputs = fusion.fused_instructions_computation()->num_parameters(); diff --git a/third_party/xla/xla/service/gpu/fusions/input_slices_mlir.h b/third_party/xla/xla/service/gpu/fusions/input_slices_mlir.h index 1de06b963d9e59..53b9d76f97a9ca 100644 --- a/third_party/xla/xla/service/gpu/fusions/input_slices_mlir.h +++ b/third_party/xla/xla/service/gpu/fusions/input_slices_mlir.h @@ -37,11 +37,11 @@ class MlirInputSlicesFusion : public MlirFusionEmitterBase { LaunchDimensions launch_dimensions() const override; std::optional ComputeThreadIdToOutputIndexing( - int64_t output_id, mlir::MLIRContext* ctx) const override; + int64_t output_id, IndexingContext* indexing_context) const override; std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const override { + IndexingContext* indexing_context) const override { // TODO(b/319081342): Implement this. return std::nullopt; } diff --git a/third_party/xla/xla/service/gpu/fusions/input_slices_test.cc b/third_party/xla/xla/service/gpu/fusions/input_slices_test.cc index 094bbfac7a27a9..939ab506b62dc4 100644 --- a/third_party/xla/xla/service/gpu/fusions/input_slices_test.cc +++ b/third_party/xla/xla/service/gpu/fusions/input_slices_test.cc @@ -23,6 +23,7 @@ limitations under the License. #include "xla/service/gpu/gpu_device_info_for_tests.h" #include "xla/service/gpu/hlo_fusion_analysis.h" #include "xla/service/gpu/model/affine_map_printer.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/service/gpu/model/indexing_test_utils.h" #include "xla/stream_executor/device_description.h" #include "xla/tests/hlo_test_base.h" @@ -34,6 +35,7 @@ namespace { class InputSlicesTest : public HloTestBase { public: + InputSlicesTest() : indexing_context_(&mlir_context_) {} void SetUp() override { HloTestBase::SetUp(); printer_ = @@ -44,6 +46,7 @@ class InputSlicesTest : public HloTestBase { protected: AffineMapPrinter printer_; mlir::MLIRContext mlir_context_; + IndexingContext indexing_context_; }; TEST_F(InputSlicesTest, ThreadIndexing) { @@ -76,7 +79,7 @@ TEST_F(InputSlicesTest, ThreadIndexing) { ASSERT_NE(fusion, nullptr); auto thread_id_to_output_indexing = - fusion->ComputeThreadIdToOutputIndexing(0, &mlir_context_); + fusion->ComputeThreadIdToOutputIndexing(0, &indexing_context_); EXPECT_THAT(thread_id_to_output_indexing->ToString(printer_), MatchIndexingString(R"( (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> (0, diff --git a/third_party/xla/xla/service/gpu/fusions/loop.cc b/third_party/xla/xla/service/gpu/fusions/loop.cc index e7a13200fe391f..35b1f18348ac6d 100644 --- a/third_party/xla/xla/service/gpu/fusions/loop.cc +++ b/third_party/xla/xla/service/gpu/fusions/loop.cc @@ -215,23 +215,24 @@ LoopFusion::LoopFusion(const HloFusionAnalysis& analysis) : analysis_(analysis), config_(ComputeLoopFusionConfig(analysis)) {} std::optional LoopFusion::ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const { + int64_t root_index, IndexingContext* indexing_context) const { auto launch_dims = launch_dimensions(); return GetDefaultThreadIdToOutputIndexingMap( - launch_dims, config_.unroll_factor, GetElementShape(analysis_), ctx); + launch_dims, config_.unroll_factor, 
GetElementShape(analysis_), + indexing_context); } std::optional LoopFusion::ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const { + IndexingContext* indexing_context) const { std::optional thread_id_to_output_indexing = - ComputeThreadIdToOutputIndexing(root_index, ctx); + ComputeThreadIdToOutputIndexing(root_index, indexing_context); if (!thread_id_to_output_indexing.has_value()) { return std::nullopt; } const HloInstruction* fusion_root = analysis_.fusion_roots()[root_index]; - auto output_to_input_indexing = - ComputeOutputToInputIndexing(fusion_root, /*output_id=*/0, ctx); + auto output_to_input_indexing = ComputeOutputToInputIndexing( + fusion_root, /*output_id=*/0, indexing_context); IndexingMapSet output_to_input_indexing_set = output_to_input_indexing.indexing_maps[hero_operand_index]; // Since we are computing the indexing for a non-fusion op, there is only one diff --git a/third_party/xla/xla/service/gpu/fusions/loop.h b/third_party/xla/xla/service/gpu/fusions/loop.h index e466abe66a843f..9371015cf0a356 100644 --- a/third_party/xla/xla/service/gpu/fusions/loop.h +++ b/third_party/xla/xla/service/gpu/fusions/loop.h @@ -40,11 +40,11 @@ class LoopFusion : public KernelFusionEmitterBase { LaunchDimensions launch_dimensions() const override; std::optional ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const override; + int64_t root_index, IndexingContext* indexing_context) const override; std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const override; + IndexingContext* indexing_context) const override; protected: absl::Status EmitKernel(IrEmitterContext& ir_emitter_context, diff --git a/third_party/xla/xla/service/gpu/fusions/loop_mlir.cc b/third_party/xla/xla/service/gpu/fusions/loop_mlir.cc index 82734d06cc9c9a..0989de4bde6726 100644 --- a/third_party/xla/xla/service/gpu/fusions/loop_mlir.cc +++ b/third_party/xla/xla/service/gpu/fusions/loop_mlir.cc @@ -63,23 +63,24 @@ const Shape& GetFusionResultShape(const HloFusionAnalysis& analysis) { } // namespace std::optional MlirLoopFusion::ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const { + int64_t root_index, IndexingContext* indexing_context) const { auto launch_dims = launch_dimensions(); return GetDefaultThreadIdToOutputIndexingMap( - launch_dims, config_.unroll_factor, GetFusionResultShape(analysis_), ctx); + launch_dims, config_.unroll_factor, GetFusionResultShape(analysis_), + indexing_context); } std::optional MlirLoopFusion::ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const { + IndexingContext* indexing_context) const { std::optional thread_id_to_output_indexing = - ComputeThreadIdToOutputIndexing(root_index, ctx); + ComputeThreadIdToOutputIndexing(root_index, indexing_context); if (!thread_id_to_output_indexing.has_value()) { return std::nullopt; } const HloInstruction* fusion_root = analysis_.fusion_roots()[root_index]; - auto output_to_input_indexing = - ComputeOutputToInputIndexing(fusion_root, /*output_id=*/0, ctx); + auto output_to_input_indexing = ComputeOutputToInputIndexing( + fusion_root, /*output_id=*/0, indexing_context); IndexingMapSet output_to_input_indexing_set = output_to_input_indexing.indexing_maps[hero_operand_index]; // Since we are computing the indexing for a non-fusion op, there is only one @@ -106,8 +107,8 @@ absl::Status 
MlirLoopFusion::EmitEntryFunction( // We enforce that all the root shapes have identical dimensions in // IsHloOpSupported. - auto indexing = - ComputeThreadIdToOutputIndexing(0, entry_function.getContext()); + IndexingContext indexing_context{entry_function.getContext()}; + auto indexing = ComputeThreadIdToOutputIndexing(0, &indexing_context); TF_RET_CHECK(indexing) << "Indexing is never nullopt"; int num_inputs = fusion.fused_instructions_computation()->num_parameters(); diff --git a/third_party/xla/xla/service/gpu/fusions/loop_mlir.h b/third_party/xla/xla/service/gpu/fusions/loop_mlir.h index 228c8c87b5ff28..b70b7070ab626f 100644 --- a/third_party/xla/xla/service/gpu/fusions/loop_mlir.h +++ b/third_party/xla/xla/service/gpu/fusions/loop_mlir.h @@ -37,11 +37,11 @@ class MlirLoopFusion : public MlirFusionEmitterBase { LaunchDimensions launch_dimensions() const override; std::optional ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const override; + int64_t root_index, IndexingContext* indexing_context) const override; std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const override; + IndexingContext* indexing_context) const override; protected: absl::Status EmitEntryFunction( diff --git a/third_party/xla/xla/service/gpu/fusions/loop_mlir_test.cc b/third_party/xla/xla/service/gpu/fusions/loop_mlir_test.cc index 1f3d41bddc46a0..9febfd5d565e66 100644 --- a/third_party/xla/xla/service/gpu/fusions/loop_mlir_test.cc +++ b/third_party/xla/xla/service/gpu/fusions/loop_mlir_test.cc @@ -47,8 +47,8 @@ TEST_F(MlirLoopFusionTest, ThreadId_IndexingUnrolled) { auto* root = module->entry_computation()->root_instruction(); auto analysis = AnalyzeFusion(*root, device_info_); MlirLoopFusion fusion(analysis); - auto thread_id_to_output_indexing = - fusion.ComputeThreadIdToOutputIndexing(/*root_index=*/0, &mlir_context_); + auto thread_id_to_output_indexing = fusion.ComputeThreadIdToOutputIndexing( + /*root_index=*/0, &indexing_context_); EXPECT_THAT(thread_id_to_output_indexing->ToString(thread_id_printer_), MatchIndexingString(R"( @@ -90,8 +90,8 @@ TEST_F(MlirLoopFusionTest, ThreadId_IndexingNotUnrolled) { auto analysis = AnalyzeFusion(*root, device_info_); MlirLoopFusion fusion(analysis); - auto thread_id_to_output_indexing = - fusion.ComputeThreadIdToOutputIndexing(/*root_index=*/0, &mlir_context_); + auto thread_id_to_output_indexing = fusion.ComputeThreadIdToOutputIndexing( + /*root_index=*/0, &indexing_context_); EXPECT_THAT(thread_id_to_output_indexing->ToString(thread_id_printer_), MatchIndexingString(R"( (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> (th_x) @@ -106,7 +106,7 @@ TEST_F(MlirLoopFusionTest, ThreadId_IndexingNotUnrolled) { unroll_id in [0, 0] )")); auto thread_id_to_input_indexing = fusion.ComputeThreadIdToInputIndexing( - /*root_index=*/0, /*hero_operand_index=*/0, &mlir_context_); + /*root_index=*/0, /*hero_operand_index=*/0, &indexing_context_); EXPECT_THAT(thread_id_to_input_indexing->ToString(thread_id_printer_), MatchIndexingString(R"( (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> (th_x) @@ -142,8 +142,8 @@ TEST_F(MlirLoopFusionTest, ThreadId_Broadcast) { auto analysis = AnalyzeFusion(*root, device_info_); MlirLoopFusion fusion(analysis); - auto thread_id_to_output_indexing = - fusion.ComputeThreadIdToOutputIndexing(/*root_index=*/0, &mlir_context_); + auto thread_id_to_output_indexing = fusion.ComputeThreadIdToOutputIndexing( + /*root_index=*/0, 
&indexing_context_); EXPECT_THAT(thread_id_to_output_indexing->ToString(thread_id_printer_), MatchIndexingString(R"( (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> ( @@ -162,7 +162,7 @@ TEST_F(MlirLoopFusionTest, ThreadId_Broadcast) { th_x + bl_x * 128 in [0, 5999] )")); auto thread_id_to_input_indexing = fusion.ComputeThreadIdToInputIndexing( - /*root_index=*/0, /*hero_operand_index=*/0, &mlir_context_); + /*root_index=*/0, /*hero_operand_index=*/0, &indexing_context_); EXPECT_THAT(thread_id_to_input_indexing->ToString(thread_id_printer_), MatchIndexingString(R"( (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> ( diff --git a/third_party/xla/xla/service/gpu/fusions/loop_test.cc b/third_party/xla/xla/service/gpu/fusions/loop_test.cc index 1bb5fdb8705d30..91e56aafe0a6b5 100644 --- a/third_party/xla/xla/service/gpu/fusions/loop_test.cc +++ b/third_party/xla/xla/service/gpu/fusions/loop_test.cc @@ -37,6 +37,7 @@ namespace { class LoopTest : public HloTestBase { public: + LoopTest() : indexing_context_(&mlir_context_) {} void SetUp() override { HloTestBase::SetUp(); @@ -50,6 +51,7 @@ class LoopTest : public HloTestBase { TestGpuDeviceInfo::RTXA6000DeviceInfo(); AffineMapPrinter printer_; mlir::MLIRContext mlir_context_; + IndexingContext indexing_context_; }; absl::StatusOr> GetFusion( @@ -84,7 +86,7 @@ TEST_F(LoopTest, ThreadIndexingUnrolled) { TF_ASSERT_OK_AND_ASSIGN(auto loop_fusion, GetFusion(analysis)); auto thread_id_to_output_indexing = loop_fusion->ComputeThreadIdToOutputIndexing(/*root_index=*/0, - &mlir_context_); + &indexing_context_); EXPECT_THAT(thread_id_to_output_indexing->ToString(printer_), MatchIndexingString(R"( @@ -127,7 +129,7 @@ TEST_F(LoopTest, ThreadIndexingNotUnrolled) { TF_ASSERT_OK_AND_ASSIGN(auto loop_fusion, GetFusion(analysis)); auto thread_id_to_output_indexing = loop_fusion->ComputeThreadIdToOutputIndexing(/*root_index=*/0, - &mlir_context_); + &indexing_context_); EXPECT_THAT(thread_id_to_output_indexing->ToString(printer_), MatchIndexingString(R"( (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> (th_x) @@ -143,7 +145,7 @@ TEST_F(LoopTest, ThreadIndexingNotUnrolled) { )")); auto thread_id_to_input_indexing = loop_fusion->ComputeThreadIdToInputIndexing( - /*root_index=*/0, /*hero_operand_index=*/0, &mlir_context_); + /*root_index=*/0, /*hero_operand_index=*/0, &indexing_context_); EXPECT_THAT(thread_id_to_input_indexing->ToString(printer_), MatchIndexingString(R"( (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> (th_x) @@ -180,7 +182,7 @@ TEST_F(LoopTest, Broadcast) { TF_ASSERT_OK_AND_ASSIGN(auto loop_fusion, GetFusion(analysis)); auto thread_id_to_output_indexing = loop_fusion->ComputeThreadIdToOutputIndexing(/*root_index=*/0, - &mlir_context_); + &indexing_context_); EXPECT_THAT(thread_id_to_output_indexing->ToString(printer_), MatchIndexingString(R"( (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> ( @@ -200,7 +202,7 @@ TEST_F(LoopTest, Broadcast) { )")); auto thread_id_to_input_indexing = loop_fusion->ComputeThreadIdToInputIndexing( - /*root_index=*/0, /*hero_operand_index=*/0, &mlir_context_); + /*root_index=*/0, /*hero_operand_index=*/0, &indexing_context_); EXPECT_THAT(thread_id_to_input_indexing->ToString(printer_), MatchIndexingString(R"( (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> ( diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.cc b/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.cc index 0701425c81ca5b..8e2124dbe0e87b 
100644 --- a/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.cc +++ b/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.cc @@ -67,6 +67,7 @@ limitations under the License. #include "xla/service/gpu/fusions/mlir/ir/xla_gpu_ops.h" #include "xla/service/gpu/hlo_traversal.h" #include "xla/service/gpu/model/indexing_analysis.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/service/gpu/model/indexing_map.h" #include "xla/shape_util.h" #include "xla/status_macros.h" @@ -468,7 +469,8 @@ absl::StatusOr> EmitPad( const HloInstruction* instr, mlir::Type result_element_type, ValueRange indices, const OperandProvider& operand_provider, ImplicitLocOpBuilder& b) { - auto indexing = ComputeOutputToInputIndexing(instr, 0, b.getContext()); + IndexingContext indexing_context{b.getContext()}; + auto indexing = ComputeOutputToInputIndexing(instr, 0, &indexing_context); const auto& indexing_map = *indexing.indexing_maps[0].begin(); mlir::Value is_in_bounds = CheckConstraints(indexing_map, indices, {}, b); @@ -673,9 +675,10 @@ absl::StatusOr> HloToMlir( operand->shape().element_type(), builder)); arg_types.push_back(operand_element_type); } - auto input_indices = GetInputIndices( - ComputeOutputToInputIndexing(instr, 0, builder.getContext()), indices, - builder); + IndexingContext indexing_context(builder.getContext()); + auto input_indices = + GetInputIndices(ComputeOutputToInputIndexing(instr, 0, &indexing_context), + indices, builder); SmallVector operands; for (auto&& [operand_number, operand_indices] : llvm::enumerate(input_indices)) { diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/mlir_fusion_emitter.h b/third_party/xla/xla/service/gpu/fusions/mlir/mlir_fusion_emitter.h index 6baf86372613e5..79836a9d5ed8a3 100644 --- a/third_party/xla/xla/service/gpu/fusions/mlir/mlir_fusion_emitter.h +++ b/third_party/xla/xla/service/gpu/fusions/mlir/mlir_fusion_emitter.h @@ -39,6 +39,7 @@ limitations under the License. #include "xla/service/gpu/fusions/fusion_emitter.h" #include "xla/service/gpu/fusions/mlir/computation_partitioner.h" #include "xla/service/gpu/ir_emitter_context.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/service/gpu/model/indexing_map.h" #include "xla/stream_executor/device_description.h" diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/mlir_fusion_emitter_test.cc b/third_party/xla/xla/service/gpu/fusions/mlir/mlir_fusion_emitter_test.cc index b0ed47330a5558..d5623e11ae58ee 100644 --- a/third_party/xla/xla/service/gpu/fusions/mlir/mlir_fusion_emitter_test.cc +++ b/third_party/xla/xla/service/gpu/fusions/mlir/mlir_fusion_emitter_test.cc @@ -61,12 +61,12 @@ class DummyCopyFusionEmitter : public MlirFusionEmitterBase { LaunchDimensions launch_dimensions() const final { return {1, 100}; } std::optional ComputeThreadIdToOutputIndexing( - int64_t, mlir::MLIRContext*) const final { + int64_t, IndexingContext*) const final { return std::nullopt; } std::optional ComputeThreadIdToInputIndexing( - int64_t, int64_t, mlir::MLIRContext*) const final { + int64_t, int64_t, IndexingContext*) const final { return std::nullopt; } diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/simplify_affine.cc b/third_party/xla/xla/service/gpu/fusions/mlir/simplify_affine.cc index c86fbda41c4cf1..2507281a283ebd 100644 --- a/third_party/xla/xla/service/gpu/fusions/mlir/simplify_affine.cc +++ b/third_party/xla/xla/service/gpu/fusions/mlir/simplify_affine.cc @@ -40,6 +40,7 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "xla/service/gpu/fusions/mlir/passes.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/service/gpu/model/indexing_map.h" namespace xla { @@ -114,7 +115,9 @@ struct RewriteAffineApply } } - IndexingMap map(op.getAffineMap(), dim_ranges, symbol_ranges); + IndexingContext indexing_context(op->getContext()); + IndexingMap map(&indexing_context, op.getAffineMap(), dim_ranges, + symbol_ranges); map.Simplify(); auto expr = map.GetAffineMap().getResult(0); diff --git a/third_party/xla/xla/service/gpu/fusions/mlir_emitter_test_base.cc b/third_party/xla/xla/service/gpu/fusions/mlir_emitter_test_base.cc index 2dfc06b9e747af..bdf424c079c7a3 100644 --- a/third_party/xla/xla/service/gpu/fusions/mlir_emitter_test_base.cc +++ b/third_party/xla/xla/service/gpu/fusions/mlir_emitter_test_base.cc @@ -49,7 +49,8 @@ limitations under the License. namespace xla { namespace gpu { -MlirEmitterTestBaseImpl::MlirEmitterTestBaseImpl() { +MlirEmitterTestBaseImpl::MlirEmitterTestBaseImpl() + : indexing_context_(&mlir_context_) { // clang-format off mlir_context_.loadDialect< mlir::affine::AffineDialect, diff --git a/third_party/xla/xla/service/gpu/fusions/mlir_emitter_test_base.h b/third_party/xla/xla/service/gpu/fusions/mlir_emitter_test_base.h index a299c2ea4007ba..147b57b6f84b70 100644 --- a/third_party/xla/xla/service/gpu/fusions/mlir_emitter_test_base.h +++ b/third_party/xla/xla/service/gpu/fusions/mlir_emitter_test_base.h @@ -28,6 +28,7 @@ limitations under the License. #include "xla/service/gpu/gpu_device_info_for_tests.h" #include "xla/service/gpu/hlo_fusion_analysis.h" #include "xla/service/gpu/model/affine_map_printer.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/stream_executor/device_description.h" #include "xla/tests/hlo_test_base.h" @@ -50,6 +51,7 @@ class MlirEmitterTestBaseImpl : public HloTestBase { stream_executor::DeviceDescription device_info_ = TestGpuDeviceInfo::RTXA6000DeviceInfo(); mlir::MLIRContext mlir_context_; + IndexingContext indexing_context_; AffineMapPrinter thread_id_printer_; }; diff --git a/third_party/xla/xla/service/gpu/fusions/reduction_base.cc b/third_party/xla/xla/service/gpu/fusions/reduction_base.cc index 57db8735cc0e60..6ea9220034eaa8 100644 --- a/third_party/xla/xla/service/gpu/fusions/reduction_base.cc +++ b/third_party/xla/xla/service/gpu/fusions/reduction_base.cc @@ -317,18 +317,19 @@ ReductionInfo ReductionInfo::Create(const HloFusionAnalysis& analysis) { } std::optional ReductionInfo::ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const { + int64_t root_index, IndexingContext* indexing_context) const { if (!groups_.is_reduction_root[root_index]) { // Non-transpose roots are elementwise by definition. 
- return ComputeThreadIdToInputIndexing(root_index, 0, ctx); + return ComputeThreadIdToInputIndexing(root_index, 0, indexing_context); } auto* root = analysis_.fusion_roots()[root_index]; auto* hero = analysis_.fusion_heroes()[root_index]; - auto block_offsets = GetBlockOffsetsForTiling(tiling_, ctx); - auto thread_ids = DelinearizeInBoundsIndex(mlir::getAffineDimExpr(0, ctx), - tiling_.GetThreadsPerBlock(), - tiling_.GetThreadStrides()); + auto mlir_context = indexing_context->GetMLIRContext(); + auto block_offsets = GetBlockOffsetsForTiling(tiling_, mlir_context); + auto thread_ids = DelinearizeInBoundsIndex( + mlir::getAffineDimExpr(0, mlir_context), tiling_.GetThreadsPerBlock(), + tiling_.GetThreadStrides()); auto physical_shape = ShapeUtil::DeleteDimensions(hero->dimensions(), hero->operand(0)->shape()); @@ -352,9 +353,10 @@ std::optional ReductionInfo::ComputeThreadIdToOutputIndexing( auto physical_index = [&]() { if (is_row_reduction_) { IndexingMap linear_index( + indexing_context, mlir::AffineMap::get( 6, 0, block_offsets.getResult(kRowKept) + thread_ids[kRowKept], - ctx), + mlir_context), dimension_ranges, {}); int rows_per_warp = GetRowsPerWarp(); if (rows_per_warp > 1) { @@ -367,20 +369,21 @@ std::optional ReductionInfo::ComputeThreadIdToOutputIndexing( return ComposeIndexingMaps( linear_index, GetBitcastMap(ShapeUtil::MakeShape( PRED, {tiling_.GetShape()[kRowKept]}), - physical_shape, ctx)); + physical_shape, indexing_context)); } IndexingMap projected_index( + indexing_context, mlir::AffineMap::get( 6, 0, {block_offsets.getResult(kColMajorKept), block_offsets.getResult(kColMinorKept) + thread_ids[kColReduced]}, - ctx), + mlir_context), dimension_ranges, {}); projected_index.AddConstraint( mlir::getAffineDimExpr( - KernelFusionInterface::kIndexingMapThreadIdxDims[0], ctx) % + KernelFusionInterface::kIndexingMapThreadIdxDims[0], mlir_context) % WarpSize(), {0, 0}); if (!is_row_reduction_) { @@ -395,24 +398,25 @@ std::optional ReductionInfo::ComputeThreadIdToOutputIndexing( GetBitcastMap(ShapeUtil::DeleteDimension( ReductionDimensions::kColReducedDimension, tiling_.GetXlaShape()), - physical_shape, ctx)); + physical_shape, indexing_context)); }(); auto map = ComposeIndexingMaps( physical_index, - GetBitcastMap(FirstShape(hero->shape()), FirstShape(root->shape()), ctx)); + GetBitcastMap(FirstShape(hero->shape()), FirstShape(root->shape()), + indexing_context)); int group_index = groups_.group_id_per_root[root_index]; map.AddConstraint( mlir::getAffineDimExpr(KernelFusionInterface::kIndexingMapBlockIdxDims[1], - ctx), + mlir_context), {group_index, group_index}); return map; } std::optional ReductionInfo::ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const { + IndexingContext* indexing_context) const { auto* hero = analysis_.fusion_heroes()[root_index]; if (groups_.is_reduction_root[root_index] && hero_operand_index >= hero->operand_count() / 2) { @@ -421,15 +425,16 @@ std::optional ReductionInfo::ComputeThreadIdToInputIndexing( } auto map = ComposeIndexingMaps( - GetIndexingMapForTiling(tiling_, ctx), + GetIndexingMapForTiling(tiling_, indexing_context), GetBitcastMap(tiling_.GetXlaShape(), - hero->operand(hero_operand_index)->shape(), ctx)); + hero->operand(hero_operand_index)->shape(), + indexing_context)); // Only threads with the right y block index actually do anything for this // root. 
int group_index = groups_.group_id_per_root[root_index]; map.AddConstraint( mlir::getAffineDimExpr(KernelFusionInterface::kIndexingMapBlockIdxDims[1], - ctx), + indexing_context->GetMLIRContext()), {group_index, group_index}); return map; } diff --git a/third_party/xla/xla/service/gpu/fusions/reduction_base.h b/third_party/xla/xla/service/gpu/fusions/reduction_base.h index 93c2ecc2681f83..89442524b7e058 100644 --- a/third_party/xla/xla/service/gpu/fusions/reduction_base.h +++ b/third_party/xla/xla/service/gpu/fusions/reduction_base.h @@ -57,11 +57,11 @@ class ReductionInfo { int GetRowsPerWarp() const; std::optional ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const; + int64_t root_index, IndexingContext* indexing_context) const; std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const; + IndexingContext* indexing_context) const; LaunchDimensions launch_dimensions() const; @@ -93,15 +93,16 @@ class ReductionFusionBase : public Base { : analysis_(analysis), reduction_info_(ReductionInfo::Create(analysis)) {} std::optional ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const override { - return reduction_info().ComputeThreadIdToOutputIndexing(root_index, ctx); + int64_t root_index, IndexingContext* indexing_context) const override { + return reduction_info().ComputeThreadIdToOutputIndexing(root_index, + indexing_context); } std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const override { + IndexingContext* indexing_context) const override { return reduction_info().ComputeThreadIdToInputIndexing( - root_index, hero_operand_index, ctx); + root_index, hero_operand_index, indexing_context); } LaunchDimensions launch_dimensions() const override { diff --git a/third_party/xla/xla/service/gpu/fusions/reduction_base_test.cc b/third_party/xla/xla/service/gpu/fusions/reduction_base_test.cc index 2c4ffa0e9ce078..6b7e8dcc2c4f42 100644 --- a/third_party/xla/xla/service/gpu/fusions/reduction_base_test.cc +++ b/third_party/xla/xla/service/gpu/fusions/reduction_base_test.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "xla/service/gpu/gpu_device_info_for_tests.h" #include "xla/service/gpu/hlo_fusion_analysis.h" #include "xla/service/gpu/ir_emitter_context.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/service/gpu/model/indexing_test_utils.h" #include "xla/stream_executor/device_description.h" #include "xla/tests/hlo_test_base.h" @@ -35,9 +36,14 @@ namespace gpu { namespace { class ReductionTest : public HloTestBase { + public: + ReductionTest() : indexing_context_(&mlir_context_) {} + protected: stream_executor::DeviceDescription device_info_ = TestGpuDeviceInfo::RTXA6000DeviceInfo(); + mlir::MLIRContext mlir_context_; + IndexingContext indexing_context_; }; class FakeReductionFusion : public ReductionFusionBase { @@ -78,11 +84,10 @@ TEST_F(ReductionTest, ThreadIndexingRowReduction) { auto* root = module->entry_computation()->root_instruction(); auto analysis = AnalyzeFusion(*root, device_info_); FakeReductionFusion fusion(analysis); - mlir::MLIRContext mlir_context; - EXPECT_THAT( - fusion.ComputeThreadIdToInputIndexing(0, 0, &mlir_context)->ToString(), - MatchIndexingString(R"( + EXPECT_THAT(fusion.ComputeThreadIdToInputIndexing(0, 0, &indexing_context_) + ->ToString(), + MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( (d3 * 8 + d0 floordiv 32) floordiv 64, (d3 * 8 + d0 floordiv 32) mod 64, @@ -103,7 +108,7 @@ TEST_F(ReductionTest, ThreadIndexingRowReduction) { d3 * 8 + d0 floordiv 32 in [0, 6399] )")); EXPECT_THAT( - fusion.ComputeThreadIdToOutputIndexing(0, &mlir_context)->ToString(), + fusion.ComputeThreadIdToOutputIndexing(0, &indexing_context_)->ToString(), MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5) -> ( (d3 * 8 + d0 floordiv 32) floordiv 64, @@ -147,11 +152,10 @@ TEST_F(ReductionTest, ThreadIndexingMultiRowReduction) { auto* root = module->entry_computation()->root_instruction(); auto analysis = AnalyzeFusion(*root, device_info_); FakeReductionFusion fusion(analysis); - mlir::MLIRContext mlir_context; - EXPECT_THAT( - fusion.ComputeThreadIdToInputIndexing(0, 0, &mlir_context)->ToString(), - MatchIndexingString(R"( + EXPECT_THAT(fusion.ComputeThreadIdToInputIndexing(0, 0, &indexing_context_) + ->ToString(), + MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( d3 + (d0 floordiv 4) floordiv 64, (d0 floordiv 4) mod 64, @@ -172,7 +176,7 @@ TEST_F(ReductionTest, ThreadIndexingMultiRowReduction) { d3 * 64 + d0 floordiv 4 in [0, 6399] )")); EXPECT_THAT( - fusion.ComputeThreadIdToOutputIndexing(0, &mlir_context)->ToString(), + fusion.ComputeThreadIdToOutputIndexing(0, &indexing_context_)->ToString(), MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5) -> ( d3 + (d0 floordiv 4) floordiv 64, @@ -217,11 +221,10 @@ TEST_F(ReductionTest, ThreadIndexingColumnReduction) { auto* root = module->entry_computation()->root_instruction(); auto analysis = AnalyzeFusion(*root, device_info_); FakeReductionFusion fusion(analysis); - mlir::MLIRContext mlir_context; - EXPECT_THAT( - fusion.ComputeThreadIdToInputIndexing(0, 0, &mlir_context)->ToString(), - MatchIndexingString(R"( + EXPECT_THAT(fusion.ComputeThreadIdToInputIndexing(0, 0, &indexing_context_) + ->ToString(), + MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( d3, d0 floordiv 32 + s1 * 32, @@ -235,7 +238,7 @@ TEST_F(ReductionTest, ThreadIndexingColumnReduction) { d0 mod 32 in [0, 31] )")); EXPECT_THAT( - fusion.ComputeThreadIdToOutputIndexing(0, &mlir_context)->ToString(), + fusion.ComputeThreadIdToOutputIndexing(0, &indexing_context_)->ToString(), MatchIndexingString(R"( (d0, 
d1, d2, d3, d4, d5) -> ( d3, @@ -273,10 +276,9 @@ TEST_F(ReductionTest, ThreadIndexingOutputLayout) { auto* root = module->entry_computation()->root_instruction(); auto analysis = AnalyzeFusion(*root, device_info_); FakeReductionFusion fusion(analysis); - mlir::MLIRContext mlir_context; EXPECT_THAT( - fusion.ComputeThreadIdToOutputIndexing(0, &mlir_context)->ToString(), + fusion.ComputeThreadIdToOutputIndexing(0, &indexing_context_)->ToString(), MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5) -> ( (d3 * 8 + d0 floordiv 32) floordiv 64, @@ -322,7 +324,6 @@ TEST_F(ReductionTest, ThreadIndexingSideOutput) { auto* root = module->entry_computation()->root_instruction(); auto analysis = AnalyzeFusion(*root, device_info_); FakeReductionFusion fusion(analysis); - mlir::MLIRContext mlir_context; constexpr char kExpectedIndexing[] = R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( @@ -344,11 +345,11 @@ TEST_F(ReductionTest, ThreadIndexingSideOutput) { d0 mod 32 + s2 * 32 in [0, 511] d3 * 8 + d0 floordiv 32 in [0, 6399] )"; + EXPECT_THAT(fusion.ComputeThreadIdToInputIndexing(1, 0, &indexing_context_) + ->ToString(), + MatchIndexingString(kExpectedIndexing)); EXPECT_THAT( - fusion.ComputeThreadIdToInputIndexing(1, 0, &mlir_context)->ToString(), - MatchIndexingString(kExpectedIndexing)); - EXPECT_THAT( - fusion.ComputeThreadIdToOutputIndexing(1, &mlir_context)->ToString(), + fusion.ComputeThreadIdToOutputIndexing(1, &indexing_context_)->ToString(), MatchIndexingString(kExpectedIndexing)); } @@ -377,9 +378,9 @@ TEST_F(ReductionTest, bla) { FakeReductionFusion fusion(analysis); mlir::MLIRContext mlir_context; - EXPECT_THAT( - fusion.ComputeThreadIdToInputIndexing(0, 0, &mlir_context)->ToString(), - MatchIndexingString(R"( + EXPECT_THAT(fusion.ComputeThreadIdToInputIndexing(0, 0, &indexing_context_) + ->ToString(), + MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3] -> ( d3, (d0 + s2 * 512) * 2 + s3 diff --git a/third_party/xla/xla/service/gpu/fusions/reduction_mlir.cc b/third_party/xla/xla/service/gpu/fusions/reduction_mlir.cc index 1ff23dddcf51ba..c0e500803c0b46 100644 --- a/third_party/xla/xla/service/gpu/fusions/reduction_mlir.cc +++ b/third_party/xla/xla/service/gpu/fusions/reduction_mlir.cc @@ -147,7 +147,9 @@ absl::Status MlirReductionFusion::EmitReduction(EmitterState& state) const { int num_warps_row = tiling.GetThreadsPerBlock() [ReductionDimensions::kRowMinorReducedDimension] / WarpSize(); - auto ctx = state.entry_function.getContext(); + + auto* mlir_context = state.entry_function.getContext(); + IndexingContext indexing_context(mlir_context); auto zero = builder.create(0); auto lane_id = builder.create(); @@ -161,10 +163,10 @@ absl::Status MlirReductionFusion::EmitReduction(EmitterState& state) const { auto thread_ids = mlir_converter::ApplyAffineMap( mlir::AffineMap::get( /*dimCount=*/1, /*symbolCount=*/0, - DelinearizeInBoundsIndex(mlir::getAffineDimExpr(0, ctx), + DelinearizeInBoundsIndex(mlir::getAffineDimExpr(0, mlir_context), tiling.GetThreadsPerBlock(), tiling.GetThreadStrides()), - ctx), + mlir_context), {thread_id}, {}, builder); SmallVector thread_and_block_indices{thread_id, zero, zero, block_id, zero, zero}; @@ -200,7 +202,7 @@ absl::Status MlirReductionFusion::EmitReduction(EmitterState& state) const { } bool use_shared = !shared_tile_size.empty(); - auto output_indexing = ComputeThreadIdToOutputIndexing(0, ctx); + auto output_indexing = ComputeThreadIdToOutputIndexing(0, &indexing_context); auto output_indices = mlir_converter::ApplyAffineMap( 
output_indexing->GetAffineMap(), thread_and_block_indices, {}, builder); auto thread_has_output = mlir_converter::CheckConstraints( @@ -236,7 +238,7 @@ absl::Status MlirReductionFusion::EmitReduction(EmitterState& state) const { SmallVector> results; for (auto* hero : reduction_heroes_) { auto input_indexing = ComputeThreadIdToInputIndexing( - reduction_roots_.at(hero).front(), 0, ctx); + reduction_roots_.at(hero).front(), 0, &indexing_context); TF_ASSIGN_OR_RETURN( auto accumulated, state.EmitPerThreadReducedElements(*input_indexing, hero, inits[hero])); diff --git a/third_party/xla/xla/service/gpu/fusions/scatter.h b/third_party/xla/xla/service/gpu/fusions/scatter.h index 6982bbc8e6bd2c..6b0e2c5fe81eb9 100644 --- a/third_party/xla/xla/service/gpu/fusions/scatter.h +++ b/third_party/xla/xla/service/gpu/fusions/scatter.h @@ -44,7 +44,7 @@ class ScatterFusion : public KernelFusionEmitterBase { LaunchDimensions launch_dimensions() const override; std::optional ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const override { + int64_t root_index, IndexingContext* indexing_context) const override { // The kernel iterates over updates, whose correspondence to output // elements cannot be computed statically. return std::nullopt; @@ -52,7 +52,7 @@ class ScatterFusion : public KernelFusionEmitterBase { std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const override { + IndexingContext* indexing_context) const override { // TODO(b/319081342): Implement this. return std::nullopt; } diff --git a/third_party/xla/xla/service/gpu/fusions/scatter_mlir.cc b/third_party/xla/xla/service/gpu/fusions/scatter_mlir.cc index 85242c0740e7b6..2f4a3e0af8ce5d 100644 --- a/third_party/xla/xla/service/gpu/fusions/scatter_mlir.cc +++ b/third_party/xla/xla/service/gpu/fusions/scatter_mlir.cc @@ -83,13 +83,13 @@ bool MlirScatterFusion::IsSupported(const HloFusionAnalysis& analysis) { } std::optional MlirScatterFusion::ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const { + int64_t root_index, IndexingContext* indexing_context) const { return std::nullopt; } std::optional MlirScatterFusion::ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const { + IndexingContext* indexing_context) const { auto* scatter = DynCast(analysis_.fusion_heroes().front()); int64_t scatter_operand_count = scatter->scatter_operand_count(); @@ -106,7 +106,8 @@ std::optional MlirScatterFusion::ComputeThreadIdToInputIndexing( // Compute thread id mapping based on the first update operand. Shape scatter_update_shape = scatter->scatter_updates().front()->shape(); IndexingMap scatter_update_map = GetDefaultThreadIdToOutputIndexingMap( - launch_dimensions(), config_.unroll_factor, scatter_update_shape, ctx); + launch_dimensions(), config_.unroll_factor, scatter_update_shape, + indexing_context); // For scatter indices we project indexing for scatter updates and take the // first result of the affine map only, because they coincide. @@ -114,11 +115,14 @@ std::optional MlirScatterFusion::ComputeThreadIdToInputIndexing( Shape scatter_indices_shape = scatter->scatter_indices()->shape(); CHECK_EQ(scatter_indices_shape.rank(), 2) << scatter->ToString(); // Create a map from scatter update to scatter indices. 
+ auto* mlir_context = indexing_context->GetMLIRContext(); IndexingMap updates_to_indices_map{ + indexing_context, mlir::AffineMap::get( /*dimCount=*/scatter_update_shape.rank(), /*symbolCount=*/1, - {mlir::getAffineDimExpr(0, ctx), mlir::getAffineSymbolExpr(0, ctx)}, - ctx), + {mlir::getAffineDimExpr(0, mlir_context), + mlir::getAffineSymbolExpr(0, mlir_context)}, + mlir_context), /*dim_ranges=*/RangesFromTensorSizes(scatter_update_shape.dimensions()), /*symbol_ranges=*/ RangesFromTensorSizes({scatter_indices_shape.dimensions(1)})}; @@ -185,10 +189,11 @@ absl::Status MlirScatterFusion::EmitEntryFunction( const HloInstruction* scatter_update = scatter->operand(kScatterUpdateIndex); mlir::MLIRContext* mlir_context = entry_function.getContext(); + IndexingContext indexing_context{mlir_context}; auto thread_id_to_update_map = ComputeThreadIdToInputIndexing( /*root_index=*/0, /*hero_operand_index=*/kScatterUpdateIndex, - mlir_context) + &indexing_context) .value(); thread_id_to_update_map.Simplify(); thread_id_to_update_map.RemoveUnusedSymbols(); diff --git a/third_party/xla/xla/service/gpu/fusions/scatter_mlir.h b/third_party/xla/xla/service/gpu/fusions/scatter_mlir.h index e66e2c6a4f5a78..016a67c7c512fd 100644 --- a/third_party/xla/xla/service/gpu/fusions/scatter_mlir.h +++ b/third_party/xla/xla/service/gpu/fusions/scatter_mlir.h @@ -42,11 +42,11 @@ class MlirScatterFusion : public MlirFusionEmitterBase { static bool IsSupported(const HloFusionAnalysis& analysis); std::optional ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const override; + int64_t root_index, IndexingContext* indexing_context) const override; std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const override; + IndexingContext* indexing_context) const override; protected: absl::Status EmitEntryFunction( diff --git a/third_party/xla/xla/service/gpu/fusions/scatter_mlir_test.cc b/third_party/xla/xla/service/gpu/fusions/scatter_mlir_test.cc index f7fdba3b97db30..dd868683d745bf 100644 --- a/third_party/xla/xla/service/gpu/fusions/scatter_mlir_test.cc +++ b/third_party/xla/xla/service/gpu/fusions/scatter_mlir_test.cc @@ -97,25 +97,25 @@ TEST_F(MlirScatterFusionTest, ThreadId_IndexingUnrolled) { EXPECT_THAT( fusion .ComputeThreadIdToInputIndexing( - /*root_index=*/0, /*hero_operand_index=*/3, &mlir_context_) + /*root_index=*/0, /*hero_operand_index=*/3, &indexing_context_) ->ToString(thread_id_printer_), MatchIndexingString(kUpdatesIndexing)); EXPECT_THAT( fusion .ComputeThreadIdToInputIndexing( - /*root_index=*/0, /*hero_operand_index=*/4, &mlir_context_) + /*root_index=*/0, /*hero_operand_index=*/4, &indexing_context_) ->ToString(thread_id_printer_), MatchIndexingString(kUpdatesIndexing)); EXPECT_THAT( fusion .ComputeThreadIdToInputIndexing( - /*root_index=*/1, /*hero_operand_index=*/3, &mlir_context_) + /*root_index=*/1, /*hero_operand_index=*/3, &indexing_context_) ->ToString(thread_id_printer_), MatchIndexingString(kUpdatesIndexing)); EXPECT_THAT( fusion .ComputeThreadIdToInputIndexing( - /*root_index=*/1, /*hero_operand_index=*/4, &mlir_context_) + /*root_index=*/1, /*hero_operand_index=*/4, &indexing_context_) ->ToString(thread_id_printer_), MatchIndexingString(kUpdatesIndexing)); @@ -137,13 +137,13 @@ TEST_F(MlirScatterFusionTest, ThreadId_IndexingUnrolled) { EXPECT_THAT( fusion .ComputeThreadIdToInputIndexing( - /*root_index=*/0, /*hero_operand_index=*/2, &mlir_context_) + /*root_index=*/0, /*hero_operand_index=*/2, 
&indexing_context_) ->ToString(thread_id_printer_), MatchIndexingString(kIndicesIndexing)); EXPECT_THAT( fusion .ComputeThreadIdToInputIndexing( - /*root_index=*/1, /*hero_operand_index=*/2, &mlir_context_) + /*root_index=*/1, /*hero_operand_index=*/2, &indexing_context_) ->ToString(thread_id_printer_), MatchIndexingString(kIndicesIndexing)); } diff --git a/third_party/xla/xla/service/gpu/fusions/transpose.cc b/third_party/xla/xla/service/gpu/fusions/transpose.cc index 99f113cbafbea7..fbce46e7b82665 100644 --- a/third_party/xla/xla/service/gpu/fusions/transpose.cc +++ b/third_party/xla/xla/service/gpu/fusions/transpose.cc @@ -284,19 +284,20 @@ LaunchDimensions TransposeFusion::launch_dimensions() const { } std::optional TransposeFusion::ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const { + int64_t root_index, IndexingContext* indexing_context) const { + auto* mlir_context = indexing_context->GetMLIRContext(); const auto& hero = *analysis_.fusion_heroes()[root_index]; const auto& root = *analysis_.fusion_roots()[root_index]; if (!GetDescriptionForTiledTransposeEmitter(root, hero)) { // Non-transpose roots are elementwise by definition. - return ComputeThreadIdToInputIndexing(root_index, 0, ctx); + return ComputeThreadIdToInputIndexing(root_index, 0, indexing_context); } // The block offsets are permuted, but the thread offsets remain the same. - auto block_offset = GetBlockOffsetsForTiling(tiling_, ctx) + auto block_offset = GetBlockOffsetsForTiling(tiling_, mlir_context) .getSubMap(std::vector{permutation_.begin(), permutation_.end()}); - auto thread_offset = GetThreadOffsetsForTiling(tiling_, ctx); + auto thread_offset = GetThreadOffsetsForTiling(tiling_, mlir_context); auto permuted_tiled_shape = ShapeUtil::MakeShape(U8, Permute(tiling_.GetShape(), permutation_)); @@ -304,20 +305,21 @@ std::optional TransposeFusion::ComputeThreadIdToOutputIndexing( GetIndexingMapForTiling( block_offset, thread_offset, tiling_.GetNumThreadsPerBlock(), tiling_.GetNumBlocks(), tiling_.GetThreadTileSize(), - permuted_tiled_shape.dimensions()), - GetBitcastMap(permuted_tiled_shape, hero.shape(), ctx)); + permuted_tiled_shape.dimensions(), indexing_context), + GetBitcastMap(permuted_tiled_shape, hero.shape(), indexing_context)); map.Simplify(); return map; } std::optional TransposeFusion::ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const { + IndexingContext* indexing_context) const { const auto& hero = *analysis_.fusion_heroes()[root_index]; auto map = ComposeIndexingMaps( - GetIndexingMapForTiling(tiling_, ctx), - GetBitcastMap(tiling_.GetXlaShape(), hero.operand(0)->shape(), ctx)); + GetIndexingMapForTiling(tiling_, indexing_context), + GetBitcastMap(tiling_.GetXlaShape(), hero.operand(0)->shape(), + indexing_context)); map.Simplify(); return map; } diff --git a/third_party/xla/xla/service/gpu/fusions/transpose.h b/third_party/xla/xla/service/gpu/fusions/transpose.h index 899b1cb94390ae..d45cf15c762561 100644 --- a/third_party/xla/xla/service/gpu/fusions/transpose.h +++ b/third_party/xla/xla/service/gpu/fusions/transpose.h @@ -64,11 +64,11 @@ class TransposeFusion : public KernelFusionEmitterBase { LaunchDimensions launch_dimensions() const override; std::optional ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const override; + int64_t root_index, IndexingContext* indexing_context) const override; std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t 
hero_operand_index, - mlir::MLIRContext* ctx) const override; + IndexingContext* indexing_context) const override; protected: absl::Status EmitKernel(IrEmitterContext& ir_emitter_context, diff --git a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc index 8f3f4ef37480b4..4b8a2af5661935 100644 --- a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc +++ b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc @@ -146,23 +146,24 @@ MlirTransposeFusion::MlirTransposeFusion(const HloFusionAnalysis& analysis) } std::optional MlirTransposeFusion::ComputeThreadIdToOutputIndexing( - int64_t root_index, MLIRContext* ctx) const { + int64_t root_index, IndexingContext* indexing_context) const { const auto& hero = *analysis_.fusion_heroes()[root_index]; const auto& root = *analysis_.fusion_roots()[root_index]; if (!GetDescriptionForTiledTransposeEmitter(root, hero)) { // Non-transpose roots are elementwise by definition. - return ComputeThreadIdToInputIndexing(root_index, 0, ctx); + return ComputeThreadIdToInputIndexing(root_index, 0, indexing_context); } - return ComputeThreadIdToOutputIndexing(hero, ctx); + return ComputeThreadIdToOutputIndexing(hero, indexing_context); } IndexingMap MlirTransposeFusion::ComputeThreadIdToOutputIndexing( - const HloInstruction& hero, MLIRContext* ctx) const { + const HloInstruction& hero, IndexingContext* indexing_context) const { // The block offsets are permuted, but the thread offsets remain the same. - auto block_offset = GetBlockOffsetsForTiling(tiling_, ctx) + auto* mlir_context = indexing_context->GetMLIRContext(); + auto block_offset = GetBlockOffsetsForTiling(tiling_, mlir_context) .getSubMap(std::vector{permutation_.begin(), permutation_.end()}); - auto thread_offset = GetThreadOffsetsForTiling(tiling_, ctx); + auto thread_offset = GetThreadOffsetsForTiling(tiling_, mlir_context); auto permuted_tiled_shape = ShapeUtil::MakeShape(U8, Permute(tiling_.GetShape(), permutation_)); @@ -170,17 +171,18 @@ IndexingMap MlirTransposeFusion::ComputeThreadIdToOutputIndexing( GetIndexingMapForTiling( block_offset, thread_offset, tiling_.GetNumThreadsPerBlock(), tiling_.GetNumBlocks(), tiling_.GetThreadTileSize(), - permuted_tiled_shape.dimensions()), - GetBitcastMap(permuted_tiled_shape, hero.shape(), ctx)); + permuted_tiled_shape.dimensions(), indexing_context), + GetBitcastMap(permuted_tiled_shape, hero.shape(), indexing_context)); map.Simplify(); return map; } IndexingMap MlirTransposeFusion::ComputeThreadIdToInputIndexing( - const HloInstruction& hero, MLIRContext* ctx) const { + const HloInstruction& hero, IndexingContext* indexing_context) const { auto map = ComposeIndexingMaps( - GetIndexingMapForTiling(tiling_, ctx), - GetBitcastMap(tiling_.GetXlaShape(), hero.operand(0)->shape(), ctx)); + GetIndexingMapForTiling(tiling_, indexing_context), + GetBitcastMap(tiling_.GetXlaShape(), hero.operand(0)->shape(), + indexing_context)); map.Simplify(); return map; } @@ -194,6 +196,7 @@ LaunchDimensions MlirTransposeFusion::launch_dimensions() const { IndexingMap GetSharedMemoryWriteIndexingMap( const IndexingMap& thread_id_indexing, int loop_dim) { auto* mlir_context = thread_id_indexing.GetMLIRContext(); + IndexingContext indexing_context{mlir_context}; AffineExpr c0 = mlir::getAffineConstantExpr(0, mlir_context); AffineExpr th_x = mlir::getAffineDimExpr(0, mlir_context); @@ -201,6 +204,7 @@ IndexingMap GetSharedMemoryWriteIndexingMap( mlir::bindSymbolsList(mlir_context, 
llvm::MutableArrayRef(tile_sizes)); IndexingMap shmem_write_indexing{ + &indexing_context, AffineMap::get( thread_id_indexing.GetDimensionCount(), thread_id_indexing.GetSymbolCount(), @@ -219,7 +223,8 @@ IndexingMap GetSharedMemoryReadIndexingMap( const IndexingMap& thread_id_indexing, int loop_dim) { IndexingMap write_indexing = GetSharedMemoryWriteIndexingMap(thread_id_indexing, loop_dim); - return IndexingMap{write_indexing.GetAffineMap().getSubMap({0, 2, 1}), + return IndexingMap{thread_id_indexing.GetIndexingContext(), + write_indexing.GetAffineMap().getSubMap({0, 2, 1}), write_indexing.GetDimensionRanges(), write_indexing.GetSymbolRanges(), write_indexing.GetConstraints()}; @@ -236,10 +241,11 @@ absl::StatusOr> MlirTransposeFusion::EmitWriteToShMemMlir( int num_inputs = fusion.fused_instructions_computation()->num_parameters(); int num_outputs = entry_function.getArguments().size() - num_inputs; + IndexingContext indexing_context{builder.getContext()}; SmallVector shmem_intermediate_result; for (auto* transpose : shmem_transposes_) { auto input_indexing = - ComputeThreadIdToInputIndexing(*transpose, builder.getContext()); + ComputeThreadIdToInputIndexing(*transpose, &indexing_context); IndexingMap shmem_input_indexing = GetSharedMemoryWriteIndexingMap(input_indexing, permutation_[2]); @@ -288,15 +294,16 @@ absl::Status MlirTransposeFusion::EmitReadFromShMemMlir( const mlir_converter::PartitionedComputations& computations, const CallTargetProvider& call_targets, ValueRange shmem_tensors) const { int num_inputs = fusion.fused_instructions_computation()->num_parameters(); - + auto* mlir_context = builder.getContext(); + IndexingContext indexing_context{mlir_context}; ValueRange output_tensor_args = entry_function.getArguments().drop_front(num_inputs); auto output_indexing = ComputeThreadIdToOutputIndexing( - *shmem_transposes_.front(), builder.getContext()); + *shmem_transposes_.front(), &indexing_context); auto shmem_output_indexing = GetSharedMemoryReadIndexingMap(output_indexing, permutation_[2]); auto epilogue_indexing = ComputeEpilogueInputToOutputIndexing( - shmem_transposes_.front(), builder.getContext()); + shmem_transposes_.front(), &indexing_context); auto root_indexing = ComposeIndexingMaps(output_indexing, epilogue_indexing); auto result_tensors = EmitThreadLoopNest( builder, output_tensor_args, output_indexing, diff --git a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h index 58c8d6265ae838..fd9f5863e8260e 100644 --- a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h +++ b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h @@ -54,20 +54,20 @@ class MlirTransposeFusion : public MlirFusionEmitterBase { static bool IsSupported(const HloFusionAnalysis& analysis); std::optional ComputeThreadIdToOutputIndexing( - int64_t root_index, mlir::MLIRContext* ctx) const override; + int64_t root_index, IndexingContext* indexing_context) const override; std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - mlir::MLIRContext* ctx) const override { + IndexingContext* indexing_context) const override { return ComputeThreadIdToInputIndexing( - *analysis_.fusion_heroes()[root_index], ctx); + *analysis_.fusion_heroes()[root_index], indexing_context); } protected: - IndexingMap ComputeThreadIdToInputIndexing(const HloInstruction& hero, - mlir::MLIRContext* ctx) const; - IndexingMap ComputeThreadIdToOutputIndexing(const HloInstruction& hero, - mlir::MLIRContext* ctx) const; 
+ IndexingMap ComputeThreadIdToInputIndexing( + const HloInstruction& hero, IndexingContext* indexing_context) const; + IndexingMap ComputeThreadIdToOutputIndexing( + const HloInstruction& hero, IndexingContext* indexing_context) const; absl::Status EmitEntryFunction( const mlir_converter::PartitionedComputations& computations, diff --git a/third_party/xla/xla/service/gpu/fusions/transpose_mlir_test.cc b/third_party/xla/xla/service/gpu/fusions/transpose_mlir_test.cc index e1d64067afb90a..38fe0789b8eadf 100644 --- a/third_party/xla/xla/service/gpu/fusions/transpose_mlir_test.cc +++ b/third_party/xla/xla/service/gpu/fusions/transpose_mlir_test.cc @@ -46,9 +46,9 @@ TEST_F(MlirTransposeFusionTest, ThreadIndexing021) { auto analysis = AnalyzeFusion(*root, device_info_); MlirTransposeFusion fusion(analysis); - EXPECT_THAT( - fusion.ComputeThreadIdToInputIndexing(0, 0, &mlir_context_)->ToString(), - MatchIndexingString(R"( + EXPECT_THAT(fusion.ComputeThreadIdToInputIndexing(0, 0, &indexing_context_) + ->ToString(), + MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( d3 floordiv 2, d0 floordiv 32 + s1 * 4, @@ -67,7 +67,7 @@ TEST_F(MlirTransposeFusionTest, ThreadIndexing021) { s2 in [0, 0] )")); EXPECT_THAT( - fusion.ComputeThreadIdToOutputIndexing(0, &mlir_context_)->ToString(), + fusion.ComputeThreadIdToOutputIndexing(0, &indexing_context_)->ToString(), MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( d3 floordiv 2, @@ -105,9 +105,9 @@ TEST_F(MlirTransposeFusionTest, ThreadIndexing201) { auto analysis = AnalyzeFusion(*root, device_info_); MlirTransposeFusion fusion(analysis); - EXPECT_THAT( - fusion.ComputeThreadIdToInputIndexing(0, 0, &mlir_context_)->ToString(), - MatchIndexingString(R"( + EXPECT_THAT(fusion.ComputeThreadIdToInputIndexing(0, 0, &indexing_context_) + ->ToString(), + MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( d3 floordiv 2, d0 floordiv 32 + (d3 * 32 + s1 * 4) mod 64, @@ -126,7 +126,7 @@ TEST_F(MlirTransposeFusionTest, ThreadIndexing201) { s2 in [0, 0] )")); EXPECT_THAT( - fusion.ComputeThreadIdToOutputIndexing(0, &mlir_context_)->ToString(), + fusion.ComputeThreadIdToOutputIndexing(0, &indexing_context_)->ToString(), MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( d0 floordiv 32 + s1 * 4, diff --git a/third_party/xla/xla/service/gpu/fusions/transpose_test.cc b/third_party/xla/xla/service/gpu/fusions/transpose_test.cc index d7363bbd39f382..94d3df1898ad3b 100644 --- a/third_party/xla/xla/service/gpu/fusions/transpose_test.cc +++ b/third_party/xla/xla/service/gpu/fusions/transpose_test.cc @@ -37,9 +37,14 @@ namespace { using ::testing::HasSubstr; class TransposeTest : public HloTestBase { + public: + TransposeTest() : indexing_context_(&mlir_context_) {} + protected: stream_executor::DeviceDescription device_info_ = TestGpuDeviceInfo::RTXA6000DeviceInfo(); + mlir::MLIRContext mlir_context_; + IndexingContext indexing_context_; }; absl::StatusOr> GetTransposeFusion( @@ -74,9 +79,9 @@ TEST_F(TransposeTest, ThreadIndexing021) { TF_ASSERT_OK_AND_ASSIGN(auto fusion, GetTransposeFusion(analysis)); mlir::MLIRContext mlir_context; - EXPECT_THAT( - fusion->ComputeThreadIdToInputIndexing(0, 0, &mlir_context)->ToString(), - MatchIndexingString(R"( + EXPECT_THAT(fusion->ComputeThreadIdToInputIndexing(0, 0, &indexing_context_) + ->ToString(), + MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( d3 floordiv 2, d0 floordiv 32 + s1 * 4, @@ -94,9 +99,9 @@ TEST_F(TransposeTest, ThreadIndexing021) { s1 in [0, 7] 
s2 in [0, 0] )")); - EXPECT_THAT( - fusion->ComputeThreadIdToOutputIndexing(0, &mlir_context)->ToString(), - MatchIndexingString(R"( + EXPECT_THAT(fusion->ComputeThreadIdToOutputIndexing(0, &indexing_context_) + ->ToString(), + MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( d3 floordiv 2, d0 floordiv 32 + (d3 mod 2) * 32 + s1 * 4, @@ -136,9 +141,9 @@ TEST_F(TransposeTest, ThreadIndexing201) { TF_ASSERT_OK_AND_ASSIGN(auto fusion, GetTransposeFusion(analysis)); mlir::MLIRContext mlir_context; - EXPECT_THAT( - fusion->ComputeThreadIdToInputIndexing(0, 0, &mlir_context)->ToString(), - MatchIndexingString(R"( + EXPECT_THAT(fusion->ComputeThreadIdToInputIndexing(0, 0, &indexing_context_) + ->ToString(), + MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( d3 floordiv 2, d0 floordiv 32 + (d3 * 32 + s1 * 4) mod 64, @@ -156,9 +161,9 @@ TEST_F(TransposeTest, ThreadIndexing201) { s1 in [0, 7] s2 in [0, 0] )")); - EXPECT_THAT( - fusion->ComputeThreadIdToOutputIndexing(0, &mlir_context)->ToString(), - MatchIndexingString(R"( + EXPECT_THAT(fusion->ComputeThreadIdToOutputIndexing(0, &indexing_context_) + ->ToString(), + MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( d0 floordiv 32 + s1 * 4, d3 floordiv 2, @@ -200,9 +205,9 @@ TEST_F(TransposeTest, ThreadIndexingPartialBlock) { TF_ASSERT_OK_AND_ASSIGN(auto fusion, GetTransposeFusion(analysis)); mlir::MLIRContext mlir_context; - EXPECT_THAT( - fusion->ComputeThreadIdToInputIndexing(0, 0, &mlir_context)->ToString(), - MatchIndexingString(R"( + EXPECT_THAT(fusion->ComputeThreadIdToInputIndexing(0, 0, &indexing_context_) + ->ToString(), + MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( d0 floordiv 32 + s0 * 4, d3, @@ -222,9 +227,9 @@ TEST_F(TransposeTest, ThreadIndexingPartialBlock) { d0 floordiv 32 + s0 * 4 in [0, 23] d0 mod 32 in [0, 23] )")); - EXPECT_THAT( - fusion->ComputeThreadIdToOutputIndexing(0, &mlir_context)->ToString(), - MatchIndexingString(R"( + EXPECT_THAT(fusion->ComputeThreadIdToOutputIndexing(0, &indexing_context_) + ->ToString(), + MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( s0, d0 floordiv 32, diff --git a/third_party/xla/xla/service/gpu/ir_emitter_context.h b/third_party/xla/xla/service/gpu/ir_emitter_context.h index cc79e4cd3c8266..2ae9a636d7fcc3 100644 --- a/third_party/xla/xla/service/gpu/ir_emitter_context.h +++ b/third_party/xla/xla/service/gpu/ir_emitter_context.h @@ -35,6 +35,7 @@ limitations under the License. #include "xla/service/gpu/gpu_executable.h" #include "xla/service/gpu/ir_emission_utils.h" #include "xla/service/gpu/kernel_reuse_cache.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/service/gpu/nccl_collective_thunk.h" #include "xla/service/name_uniquer.h" #include "xla/stream_executor/device_description.h" @@ -69,6 +70,7 @@ class IrEmitterContext { platform_name_(std::move(platform_name)), gpu_device_info_(gpu_device_info), mlir_context_(mlir_context), + indexing_context_(mlir_context_), llvm_module_(llvm_module), emit_kernels_(emit_kernels) {} // Disallow copy and assign. @@ -98,6 +100,7 @@ class IrEmitterContext { return cc != nullptr ? 
*cc : se::RocmComputeCapability(); } mlir::MLIRContext* mlir_context() { return mlir_context_; } + IndexingContext* indexing_context() { return &indexing_context_; } llvm::Module* llvm_module() { return llvm_module_; } NameUniquer* name_uniquer() { return &name_uniquer_; } @@ -126,6 +129,7 @@ class IrEmitterContext { std::string platform_name_; const se::DeviceDescription& gpu_device_info_; mlir::MLIRContext* mlir_context_; + IndexingContext indexing_context_; llvm::Module* llvm_module_; NameUniquer name_uniquer_; std::vector constants_; diff --git a/third_party/xla/xla/service/gpu/model/BUILD b/third_party/xla/xla/service/gpu/model/BUILD index 84e8e5001e101b..a3b08d3fc5b971 100644 --- a/third_party/xla/xla/service/gpu/model/BUILD +++ b/third_party/xla/xla/service/gpu/model/BUILD @@ -418,10 +418,18 @@ xla_cc_test( cc_library( name = "indexing_map", - srcs = ["indexing_map.cc"], - hdrs = ["indexing_map.h"], + srcs = [ + "indexing_context.cc", + "indexing_map.cc", + ], + hdrs = [ + "indexing_context.h", + "indexing_map.h", + ], deps = [ ":affine_map_printer", + "//xla/hlo/ir:hlo", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/types:span", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", @@ -576,6 +584,7 @@ xla_cc_test( srcs = ["coalescing_analysis_test.cc"], deps = [ ":coalescing_analysis", + ":indexing_map", "//xla:shape_util", "//xla/hlo/ir:hlo", "//xla/service:hlo_module_config", diff --git a/third_party/xla/xla/service/gpu/model/coalescing_analysis.cc b/third_party/xla/xla/service/gpu/model/coalescing_analysis.cc index 2eed7c5ad26826..c697fb752f9b90 100644 --- a/third_party/xla/xla/service/gpu/model/coalescing_analysis.cc +++ b/third_party/xla/xla/service/gpu/model/coalescing_analysis.cc @@ -223,11 +223,14 @@ bool IsCoalesced(const IndexingMap& thread_id_to_input_indexing_map, if (thread_id_to_input_indexing_map.GetAffineMap().getNumResults() == 0) { return true; } - MLIRContext* mlir_context = thread_id_to_input_indexing_map.GetMLIRContext(); + IndexingContext* indexing_context = + thread_id_to_input_indexing_map.GetIndexingContext(); + mlir::MLIRContext* mlir_context = indexing_context->GetMLIRContext(); AffineExpr thread_x_dim = mlir::getAffineDimExpr( KernelFusionInterface::kIndexingMapThreadIdxDims[0], mlir_context); AffineExpr c0 = mlir::getAffineConstantExpr(0, mlir_context); IndexingMap thread_x_first_32_elements{ + indexing_context, AffineMap::get(1, 0, {thread_x_dim, c0, c0, c0, c0, c0}, mlir_context), {Interval{0, 31}}, {}}; @@ -257,7 +260,8 @@ std::optional GetThreadIdToInputMemoryLayoutsMaps( const HloFusionAdaptor& fusion_adaptor, absl::Span operands, const HloFusionAnalysis& fusion_analysis, - KernelFusionInterface* fusion_interface, mlir::MLIRContext* mlir_context) { + KernelFusionInterface* fusion_interface, + IndexingContext* indexing_context) { GroupedByOpIndexingMap result; for (const auto& [root_index, hero] : llvm::enumerate(fusion_analysis.fusion_heroes())) { @@ -269,7 +273,7 @@ std::optional GetThreadIdToInputMemoryLayoutsMaps( // Compute thread ID -> hero operand indexing map. 
std::optional thread_id_to_hero_operand_map = fusion_interface->ComputeThreadIdToInputIndexing( - root_index, hero_operand_index, mlir_context); + root_index, hero_operand_index, indexing_context); if (!thread_id_to_hero_operand_map.has_value()) { return std::nullopt; } @@ -277,7 +281,7 @@ std::optional GetThreadIdToInputMemoryLayoutsMaps( HloInstructionAdaptor hero_operand_adaptor(*hero_operand); GroupedByOpIndexingMap instr_indexing_keyed_by_operands = ComputeGroupedOutputToInputIndexing( - fusion_adaptor, hero_operand_adaptor, mlir_context); + fusion_adaptor, hero_operand_adaptor, indexing_context); // For every operand compute thread ID -> physical layout of operand // indexing map. for (const HloInstruction* operand : operands) { @@ -291,11 +295,11 @@ std::optional GetThreadIdToInputMemoryLayoutsMaps( IndexingMap operand_logical_to_physical_map = GetIndexingMapFromLogicalToPhysicalLayout(operand_shape, - mlir_context); + indexing_context); IndexingMap operand_physical_to_linearized_shape = GetBitcastMap( ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( operand_shape), - GetLinearizedShape(operand_shape), mlir_context); + GetLinearizedShape(operand_shape), indexing_context); IndexingMap operand_logical_to_linearized_physical_shape = operand_logical_to_physical_map * operand_physical_to_linearized_shape; @@ -330,12 +334,12 @@ CoalescingAnalysis::CoalescingAnalysis( const HloInstruction* instr, absl::Span operands, const HloFusionAnalysis& fusion_analysis, - KernelFusionInterface* fusion_interface, mlir::MLIRContext* mlir_context, + KernelFusionInterface* fusion_interface, IndexingContext* indexing_context, bool use_heuristic) { auto fusion_adaptor = HloFusionAdaptor::ForInstruction(instr); if (!use_heuristic && ComputeCoalescingForAllOperands( *fusion_adaptor, operands, fusion_analysis, - fusion_interface, mlir_context)) { + fusion_interface, indexing_context)) { return; } // If ComputeCoalescingForAllOperands fails, fallback to using the heuristic. @@ -347,12 +351,12 @@ CoalescingAnalysis::CoalescingAnalysis( const HloInstruction* producer, const HloInstruction* consumer, absl::Span operands, const HloFusionAnalysis& fusion_analysis, - KernelFusionInterface* fusion_interface, mlir::MLIRContext* mlir_context, + KernelFusionInterface* fusion_interface, IndexingContext* indexing_context, bool use_heuristic) { ProducerConsumerFusion fusion_adaptor(producer, consumer); if (!use_heuristic && ComputeCoalescingForAllOperands(fusion_adaptor, operands, fusion_analysis, - fusion_interface, mlir_context)) { + fusion_interface, indexing_context)) { return; } // If ComputeCoalescingForAllOperands fails, fallback to using the heuristic. 
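The call-site pattern after this change is the one used by the updated coalescing_analysis_test.cc later in this patch: callers that used to hand CoalescingAnalysis a raw mlir::MLIRContext* now wrap it in an IndexingContext and pass that instead. A condensed sketch of that pattern follows; the root, analysis, and fusion variables are assumed to be set up exactly as in that test, and this is an illustration only, not an additional hunk.

  // Wrap the MLIR context once and hand the wrapper to the analysis.
  mlir::MLIRContext mlir_context;
  IndexingContext indexing_context(&mlir_context);
  CoalescingAnalysis coalescing_analysis(root, root->operands(), analysis,
                                         fusion, &indexing_context,
                                         /*use_heuristic=*/false);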
@@ -364,11 +368,12 @@ bool CoalescingAnalysis::ComputeCoalescingForAllOperands( const HloFusionAdaptor& fusion_adaptor, absl::Span operands, const HloFusionAnalysis& fusion_analysis, - KernelFusionInterface* fusion_interface, mlir::MLIRContext* mlir_context) { + KernelFusionInterface* fusion_interface, + IndexingContext* indexing_context) { std::optional thread_id_to_input_memory_layouts = GetThreadIdToInputMemoryLayoutsMaps(fusion_adaptor, operands, fusion_analysis, fusion_interface, - mlir_context); + indexing_context); if (!thread_id_to_input_memory_layouts.has_value()) { return false; } diff --git a/third_party/xla/xla/service/gpu/model/coalescing_analysis.h b/third_party/xla/xla/service/gpu/model/coalescing_analysis.h index 300036aa453bae..86e93dcad69d3b 100644 --- a/third_party/xla/xla/service/gpu/model/coalescing_analysis.h +++ b/third_party/xla/xla/service/gpu/model/coalescing_analysis.h @@ -38,7 +38,7 @@ class CoalescingAnalysis { absl::Span operands, const HloFusionAnalysis& fusion_analysis, KernelFusionInterface* fusion_interface = nullptr, - mlir::MLIRContext* mlir_context = nullptr, + IndexingContext* indexing_context = nullptr, bool use_heuristic = true); // Computes read coalescing for operands of fused `producer` and `consumer`. @@ -47,7 +47,7 @@ class CoalescingAnalysis { absl::Span operands, const HloFusionAnalysis& fusion_analysis, KernelFusionInterface* fusion_interface = nullptr, - mlir::MLIRContext* mlir_context = nullptr, + IndexingContext* indexing_context = nullptr, bool use_heuristic = true); // Returns true if the operand is read coalesced. @@ -58,7 +58,8 @@ class CoalescingAnalysis { const HloFusionAdaptor& fusion_adaptor, absl::Span operands, const HloFusionAnalysis& fusion_analysis, - KernelFusionInterface* fusion_interface, mlir::MLIRContext* mlir_context); + KernelFusionInterface* fusion_interface, + IndexingContext* indexing_context = nullptr); absl::flat_hash_map coalescing_per_operand_; bool is_coalesced_computed_by_heuristic_ = false; diff --git a/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc b/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc index 18a69aa6bf404b..5a788bb1e0fee1 100644 --- a/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc +++ b/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc @@ -29,6 +29,7 @@ limitations under the License. 
#include "xla/service/gpu/gpu_device_info_for_tests.h" #include "xla/service/gpu/hlo_fusion_analysis.h" #include "xla/service/gpu/hlo_traversal.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/service/hlo_module_config.h" #include "xla/shape.h" #include "xla/shape_util.h" @@ -44,6 +45,8 @@ using ::testing::ElementsAre; class CoalescingTest : public HloTestBase { public: + CoalescingTest() : indexing_context_(&mlir_context_) {} + std::vector IsReadCoalescedPerOperand(absl::string_view hlo_string) { auto module = ParseAndReturnVerifiedModule(hlo_string).value(); HloInstruction* root = module->entry_computation()->root_instruction(); @@ -58,7 +61,7 @@ class CoalescingTest : public HloTestBase { EXPECT_TRUE(emitter.ok()); CoalescingAnalysis coalescing_analysis(root, root->operands(), analysis, - fusion, &mlir_context_, + fusion, &indexing_context_, /*use_heuristic=*/false); std::vector results; @@ -80,6 +83,7 @@ class CoalescingTest : public HloTestBase { stream_executor::DeviceDescription device_info_ = TestGpuDeviceInfo::RTXA6000DeviceInfo(); mlir::MLIRContext mlir_context_; + IndexingContext indexing_context_; }; TEST_F(CoalescingTest, IdentityLayout) { diff --git a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc index 58325183af50e6..7d8802568a5e2d 100644 --- a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc +++ b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc @@ -123,7 +123,7 @@ GpuPerformanceModelWithIndexingAnalysis::EstimateRunTimeForFusion( // operands. For each instruction, tells which elements of the instructions // result will be used to compute one result element of the fusion. auto grouped_fusion_indexing = ComputeGroupedOutputToInputIndexing( - fusion_adaptor, roots[0], mlir_context_); + fusion_adaptor, roots[0], &indexing_context_); int64_t flops = 0; int64_t bytes_read = 0; diff --git a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.h b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.h index 14d7e520a820d3..0f2b66eef4ca07 100644 --- a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.h +++ b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.h @@ -24,6 +24,7 @@ limitations under the License. 
#include "xla/service/gpu/hlo_fusion_analysis.h" #include "xla/service/gpu/model/gpu_performance_model_base.h" #include "xla/service/gpu/model/hlo_op_profiles.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/service/hlo_cost_analysis.h" #include "xla/shape.h" #include "xla/stream_executor/device_description.h" @@ -42,7 +43,8 @@ class GpuPerformanceModelWithIndexingAnalysis : public GpuPerformanceModelBase { : hlo_op_profile_(&HloOpProfiles::Singleton().GetProfile(device_info)), device_info_(device_info), shape_size_(shape_size), - mlir_context_(mlir_context) {} + mlir_context_(mlir_context), + indexing_context_(mlir_context_) {} EstimateRunTimeData EstimateRunTimeForFusion( const HloFusionAnalysis& fusion_analysis, bool is_coalesced = true); @@ -68,6 +70,7 @@ class GpuPerformanceModelWithIndexingAnalysis : public GpuPerformanceModelBase { const se::DeviceDescription* device_info_; HloCostAnalysis::ShapeSizeFunction shape_size_; mlir::MLIRContext* mlir_context_; + IndexingContext indexing_context_; }; } // namespace gpu diff --git a/third_party/xla/xla/service/gpu/model/indexing_analysis.cc b/third_party/xla/xla/service/gpu/model/indexing_analysis.cc index a6a14c28ca8161..cc2cc9f2b83519 100644 --- a/third_party/xla/xla/service/gpu/model/indexing_analysis.cc +++ b/third_party/xla/xla/service/gpu/model/indexing_analysis.cc @@ -52,6 +52,7 @@ limitations under the License. #include "xla/service/gpu/hlo_traversal.h" #include "xla/service/gpu/matmul_utils.h" #include "xla/service/gpu/model/affine_map_printer.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/service/gpu/model/indexing_map.h" #include "xla/shape.h" #include "xla/shape_util.h" @@ -77,22 +78,27 @@ HloInstructionIndexing CreateUnknownIndexing(int64_t count = 1) { return indexing; } -IndexingMap CreateIdentityMap(const Shape& shape, MLIRContext* ctx) { +IndexingMap CreateIdentityMap(const Shape& shape, + IndexingContext* indexing_context) { if (shape.IsTuple()) { // Should happen only for variadic reduce. In that case all tuple shapes are // equal. 
- return CreateIdentityMap(shape.tuple_shapes(0), ctx); + return CreateIdentityMap(shape.tuple_shapes(0), indexing_context); } auto dims = shape.dimensions(); IndexingMap identity_map = IndexingMap::FromTensorSizes( - AffineMap::getMultiDimIdentityMap(dims.size(), ctx), dims, {}); + indexing_context, + AffineMap::getMultiDimIdentityMap(dims.size(), + indexing_context->GetMLIRContext()), + dims, {}); return identity_map; } HloInstructionIndexing ComputeOutputToInputCwiseOpIndexing( - const HloInstruction* instr, MLIRContext* mlir_context) { - IndexingMap identity_map = CreateIdentityMap(instr->shape(), mlir_context); + const HloInstruction* instr, IndexingContext* indexing_context) { + IndexingMap identity_map = + CreateIdentityMap(instr->shape(), indexing_context); HloInstructionIndexing instr_indexing; instr_indexing.indexing_maps.resize(instr->operand_count()); @@ -104,21 +110,24 @@ HloInstructionIndexing ComputeOutputToInputCwiseOpIndexing( } HloInstructionIndexing ComputeInputToOutputCwiseOpIndexing( - const HloInstruction* instr, MLIRContext* mlir_context) { - IndexingMap identity_map = CreateIdentityMap(instr->shape(), mlir_context); + const HloInstruction* instr, IndexingContext* indexing_context) { + IndexingMap identity_map = + CreateIdentityMap(instr->shape(), indexing_context); return HloInstructionIndexing::FromIndexingMaps({identity_map}); } HloInstructionIndexing ComputeOutputToInputBroadcastOpIndexing( - const HloBroadcastInstruction* bcast, MLIRContext* mlir_context) { + const HloBroadcastInstruction* bcast, IndexingContext* indexing_context) { auto output_dims = bcast->shape().dimensions(); + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); std::vector exprs; exprs.reserve(bcast->dimensions().size()); for (int64_t bcast_dim : bcast->dimensions()) { exprs.push_back(getAffineDimExpr(bcast_dim, mlir_context)); } IndexingMap indexing_map = IndexingMap::FromTensorSizes( + indexing_context, AffineMap::get(output_dims.size(), /*symbolCount=*/0, exprs, mlir_context), output_dims, {}); @@ -126,7 +135,9 @@ HloInstructionIndexing ComputeOutputToInputBroadcastOpIndexing( } HloInstructionIndexing ComputeInputToOutputBroadcastOpIndexing( - const HloBroadcastInstruction* bcast, MLIRContext* mlir_context) { + const HloBroadcastInstruction* bcast, IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); + absl::Span bcast_dims = bcast->dimensions(); const Shape& input_shape = bcast->operand(0)->shape(); @@ -149,6 +160,7 @@ HloInstructionIndexing ComputeInputToOutputBroadcastOpIndexing( std::distance(bcast_dims.begin(), bcast_dim), mlir_context)); } IndexingMap indexing_map = IndexingMap::FromTensorSizes( + indexing_context, AffineMap::get(input_shape.rank(), added_dims_sizes.size(), exprs, mlir_context), input_shape.dimensions(), added_dims_sizes); @@ -166,7 +178,10 @@ std::vector RangesFromUpperBounds(absl::Span bounds) { } HloInstructionIndexing ComputeOutputToInputConcatenateOpIndexing( - const HloConcatenateInstruction* concat, MLIRContext* mlir_context) { + const HloConcatenateInstruction* concat, + IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); + const auto& operand_0_dims = concat->operand(0)->shape().dimensions(); // Initialize affine map and domain. 
Only concat_dim elements of both have to @@ -185,7 +200,7 @@ HloInstructionIndexing ComputeOutputToInputConcatenateOpIndexing( int64_t operand_concat_dim = operand->shape().dimensions()[concat_dim]; dim_ranges[concat_dim] = Interval{offset, offset + operand_concat_dim - 1}; concat_indexing.indexing_maps[operand_id].insert( - IndexingMap(affine_map.getAffineMap(), dim_ranges, + IndexingMap(indexing_context, affine_map.getAffineMap(), dim_ranges, /*symbol_ranges=*/{})); offset += operand_concat_dim; } @@ -194,7 +209,9 @@ HloInstructionIndexing ComputeOutputToInputConcatenateOpIndexing( HloInstructionIndexing ComputeInputToOutputConcatenateOpIndexing( const HloConcatenateInstruction* concat, int input_id, - MLIRContext* mlir_context) { + IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); + int64_t concat_dim = concat->concatenate_dimension(); int64_t offset = 0; for (int64_t operand_id = 0; operand_id < input_id; ++operand_id) { @@ -207,8 +224,8 @@ HloInstructionIndexing ComputeInputToOutputConcatenateOpIndexing( AffineMap::getMultiDimIdentityMap(operand_dims.size(), mlir_context); affine_map.setResult(concat_dim, getAffineDimExpr(concat_dim, mlir_context) + offset); - IndexingMap indexing_map = - IndexingMap::FromTensorSizes(affine_map.getAffineMap(), operand_dims, {}); + IndexingMap indexing_map = IndexingMap::FromTensorSizes( + indexing_context, affine_map.getAffineMap(), operand_dims, {}); return HloInstructionIndexing::FromIndexingMaps({indexing_map}); } @@ -216,10 +233,10 @@ HloInstructionIndexing ComputeInputToOutputConcatenateOpIndexing( // until the HloParameterInstruction is found. HloInstructionIndexing ComputeOutputToInputFusionOpIndexing( const HloFusionInstruction* fusion, int output_id, - MLIRContext* mlir_context) { + IndexingContext* indexing_context) { auto fusion_adaptor = HloFusionAdaptor::ForInstruction(fusion); auto grouped_indexing_maps = ComputeGroupedOutputToInputIndexing( - *fusion_adaptor, fusion_adaptor->GetRoots()[output_id], mlir_context); + *fusion_adaptor, fusion_adaptor->GetRoots()[output_id], indexing_context); // After the traversal, `grouped_indexing_maps` is keyed by // HloParameterInstructions. Convert them back to the operand id and return. 
@@ -232,7 +249,9 @@ HloInstructionIndexing ComputeOutputToInputFusionOpIndexing( } HloInstructionIndexing ComputeOutputToInputDotOpIndexing( - const HloDotInstruction* dot, MLIRContext* mlir_context) { + const HloDotInstruction* dot, IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); + CHECK_NE(dot, nullptr); const DotDimensionNumbers& dim_numbers = dot->dot_dimension_numbers(); absl::Span lhs_contracting_dims( @@ -297,11 +316,13 @@ HloInstructionIndexing ComputeOutputToInputDotOpIndexing( } IndexingMap lhs_indexing_map = IndexingMap::FromTensorSizes( + indexing_context, AffineMap::get(dot->shape().rank(), input_dim_sizes.size(), lhs_exprs, mlir_context), dot->shape().dimensions(), input_dim_sizes); IndexingMap rhs_indexing_map = IndexingMap::FromTensorSizes( + indexing_context, AffineMap::get(dot->shape().rank(), input_dim_sizes.size(), rhs_exprs, mlir_context), dot->shape().dimensions(), input_dim_sizes); @@ -313,7 +334,10 @@ IndexingMap ComputeOutputToInputPadOpIndexingImpl( absl::Span output_dims, absl::Span padding_low, absl::Span padding_high, - absl::Span padding_interior, MLIRContext* mlir_context) { + absl::Span padding_interior, + IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); + int64_t output_rank = output_dims.size(); std::vector exprs; @@ -338,12 +362,15 @@ IndexingMap ComputeOutputToInputPadOpIndexingImpl( ++output_dim_id; } return IndexingMap{ + indexing_context, AffineMap::get(output_rank, /*symbolCount=*/0, exprs, mlir_context), dimension_ranges, /*symbol_ranges = */ {}, absl::MakeSpan(constraints)}; } HloInstructionIndexing ComputeOutputToInputPadOpIndexing( - const HloPadInstruction* pad, MLIRContext* mlir_context) { + const HloPadInstruction* pad, IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); + const Shape& output_shape = pad->shape(); int64_t rank = output_shape.rank(); SmallVector padding_low, padding_high, padding_interior; @@ -357,8 +384,9 @@ HloInstructionIndexing ComputeOutputToInputPadOpIndexing( } IndexingMap input_indexing_map = ComputeOutputToInputPadOpIndexingImpl( output_shape.dimensions(), padding_low, padding_high, padding_interior, - mlir_context); + indexing_context); IndexingMap padding_value_indexing_map = IndexingMap::FromTensorSizes( + indexing_context, AffineMap::get(output_shape.rank(), /*symbolCount=*/0, {}, mlir_context), output_shape.dimensions(), /*symbol_upper_bounds=*/{}); return HloInstructionIndexing::FromIndexingMaps( @@ -367,7 +395,9 @@ HloInstructionIndexing ComputeOutputToInputPadOpIndexing( HloInstructionIndexing ComputeOutputToInputReduceOpIndexing( const HloReduceInstruction* reduce, int output_id, - MLIRContext* mlir_context) { + IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); + absl::flat_hash_set reduce_dims_ids(reduce->dimensions().begin(), reduce->dimensions().end()); @@ -389,10 +419,12 @@ HloInstructionIndexing ComputeOutputToInputReduceOpIndexing( exprs.push_back(getAffineDimExpr(output_dim_id++, mlir_context)); } IndexingMap inputs_indexing_map = IndexingMap::FromTensorSizes( + indexing_context, AffineMap::get(output_shape.rank(), reduce_dims_ids.size(), exprs, mlir_context), output_shape.dimensions(), parallel_dims_sizes); IndexingMap inits_indexing_map = IndexingMap::FromTensorSizes( + indexing_context, AffineMap::get(output_shape.rank(), /*symbolCount=*/0, {}, mlir_context), output_shape.dimensions(), {}); 
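The recurring mechanical change in these indexing_analysis.cc hunks is that IndexingMap construction (the constructor and IndexingMap::FromTensorSizes) now takes the IndexingContext* as its first argument, while AffineMaps are still built against the underlying mlir::MLIRContext obtained via GetMLIRContext(). A minimal sketch of the new pattern, assuming an already-constructed IndexingContext and a hypothetical 2-D tensor of shape [100, 32]:

  mlir::MLIRContext* mlir_context = indexing_context->GetMLIRContext();
  // Affine expressions and maps still live in the MLIR context...
  mlir::AffineMap id_map =
      mlir::AffineMap::getMultiDimIdentityMap(/*numDims=*/2, mlir_context);
  // ...but the resulting IndexingMap is now tied to the IndexingContext.
  IndexingMap map = IndexingMap::FromTensorSizes(
      indexing_context, id_map,
      /*dim_upper_bounds=*/{100, 32}, /*symbol_upper_bounds=*/{});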
@@ -409,7 +441,9 @@ HloInstructionIndexing ComputeOutputToInputReduceOpIndexing( HloInstructionIndexing ComputeInputToOutputReduceOpIndexing( const HloReduceInstruction* reduce, int input_id, - MLIRContext* mlir_context) { + IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); + absl::flat_hash_set reduce_dims_ids(reduce->dimensions().begin(), reduce->dimensions().end()); const Shape& input_shape = reduce->operand(input_id)->shape(); @@ -429,10 +463,12 @@ HloInstructionIndexing ComputeInputToOutputReduceOpIndexing( inits_exprs.push_back(getAffineSymbolExpr(output_dim_id++, mlir_context)); } IndexingMap inputs_indexing_map = IndexingMap::FromTensorSizes( + indexing_context, AffineMap::get(input_shape.rank(), /*symbolCount=*/0, inputs_exprs, mlir_context), input_shape.dimensions(), {}); IndexingMap inits_indexing_map = IndexingMap::FromTensorSizes( + indexing_context, AffineMap::get(0, /*symbolCount=*/output_rank, inits_exprs, mlir_context), {}, output_shape.dimensions()); @@ -452,7 +488,9 @@ HloInstructionIndexing ComputeInputToOutputReduceOpIndexing( // of bounds. HloInstructionIndexing ComputeOutputToInputReduceWindowOpIndexing( const HloReduceWindowInstruction* reduce_window, int output_id, - MLIRContext* mlir_context) { + IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); + const Shape& input_shape = reduce_window->operand(0)->shape(); const Shape& output_shape = GetOutputShape(reduce_window, 0); int64_t rank = input_shape.rank(); @@ -492,11 +530,11 @@ HloInstructionIndexing ComputeOutputToInputReduceWindowOpIndexing( // Indexing map for pad op that pads the input. IndexingMap padded_input_indexing = ComputeOutputToInputPadOpIndexingImpl( padded_input_dimensions, padding_low, padding_high, padding_interior, - mlir_context); + indexing_context); // Indexing map for reduce-window, that does not do any padding. IndexingMap reduce_window_indexing_no_padding( - AffineMap::get(rank, rank, exprs, mlir_context), dim_ranges, - symbol_ranges); + indexing_context, AffineMap::get(rank, rank, exprs, mlir_context), + dim_ranges, symbol_ranges); // Composed indexing. IndexingMap inputs_indexing = ComposeIndexingMaps( @@ -506,6 +544,7 @@ HloInstructionIndexing ComputeOutputToInputReduceWindowOpIndexing( // Indexing map for the init value. 
IndexingMap inits_indexing_map = IndexingMap::FromTensorSizes( + indexing_context, AffineMap::get(output_shape.rank(), /*symbolCount=*/0, {}, mlir_context), output_shape.dimensions(), /*symbol_upper_bounds=*/{}); @@ -677,30 +716,35 @@ AffineMap ComputeReshapeIndexingMap(const Shape& input, const Shape& output, }; HloInstructionIndexing ComputeOutputToInputReshapeOpIndexing( - const HloReshapeInstruction* reshape, MLIRContext* mlir_context) { + const HloReshapeInstruction* reshape, IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); + const auto& input = reshape->operand(0)->shape(); const auto& output = reshape->shape(); IndexingMap reshape_indexing_map = IndexingMap::FromTensorSizes( - ComputeReshapeIndexingMap(input, output, mlir_context), + indexing_context, ComputeReshapeIndexingMap(input, output, mlir_context), output.dimensions(), {}); reshape_indexing_map.Simplify(); return HloInstructionIndexing::FromIndexingMaps({reshape_indexing_map}); } HloInstructionIndexing ComputeInputToOutputReshapeOpIndexing( - const HloReshapeInstruction* reshape, MLIRContext* mlir_context) { + const HloReshapeInstruction* reshape, IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); + const auto& input = reshape->operand(0)->shape(); const auto& output = reshape->shape(); IndexingMap reshape_indexing_map = IndexingMap::FromTensorSizes( - ComputeReshapeIndexingMap(output, input, mlir_context), + indexing_context, ComputeReshapeIndexingMap(output, input, mlir_context), input.dimensions(), {}); reshape_indexing_map.Simplify(); return HloInstructionIndexing::FromIndexingMaps({reshape_indexing_map}); } HloInstructionIndexing ComputeReverseOpIndexing( - const HloReverseInstruction* reverse, MLIRContext* mlir_context) { + const HloReverseInstruction* reverse, IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); absl::flat_hash_set reverse_dims(reverse->dimensions().begin(), reverse->dimensions().end()); auto output_dims = reverse->shape().dimensions(); @@ -717,6 +761,7 @@ HloInstructionIndexing ComputeReverseOpIndexing( } IndexingMap indexing_map = IndexingMap::FromTensorSizes( + indexing_context, AffineMap::get(output_dims.size(), /*symbolCount=*/0, exprs, mlir_context), output_dims, {}); @@ -725,7 +770,8 @@ HloInstructionIndexing ComputeReverseOpIndexing( } HloInstructionIndexing ComputeOutputToInputSliceOpIndexing( - const HloSliceInstruction* slice, MLIRContext* mlir_context) { + const HloSliceInstruction* slice, IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); auto output_rank = slice->shape().rank(); std::vector exprs; @@ -736,6 +782,7 @@ HloInstructionIndexing ComputeOutputToInputSliceOpIndexing( slice->slice_starts()[dim]); } IndexingMap indexing_map = IndexingMap::FromTensorSizes( + indexing_context, AffineMap::get(output_rank, /*symbolCount=*/0, exprs, mlir_context), slice->shape().dimensions(), {}); return HloInstructionIndexing::FromIndexingMaps({indexing_map}); @@ -749,25 +796,31 @@ AffineMap ComputeTransposeIndexingMap(absl::Span permutation, } HloInstructionIndexing ComputeOutputToInputTransposeOpIndexing( - const HloTransposeInstruction* transpose, MLIRContext* mlir_context) { - AffineMap inverse_permutation = ComputeTransposeIndexingMap( - InversePermutation(transpose->dimensions()), mlir_context); - return HloInstructionIndexing::FromIndexingMaps({IndexingMap::FromTensorSizes( - 
inverse_permutation, transpose->shape().dimensions(), {})}); + const HloTransposeInstruction* transpose, + IndexingContext* indexing_context) { + AffineMap inverse_permutation = + ComputeTransposeIndexingMap(InversePermutation(transpose->dimensions()), + indexing_context->GetMLIRContext()); + return HloInstructionIndexing::FromIndexingMaps( + {IndexingMap::FromTensorSizes(indexing_context, inverse_permutation, + transpose->shape().dimensions(), {})}); } HloInstructionIndexing ComputeInputToOutputTransposeOpIndexing( - const HloTransposeInstruction* transpose, MLIRContext* mlir_context) { - AffineMap forward_permutation = - ComputeTransposeIndexingMap(transpose->dimensions(), mlir_context); + const HloTransposeInstruction* transpose, + IndexingContext* indexing_context) { + AffineMap forward_permutation = ComputeTransposeIndexingMap( + transpose->dimensions(), indexing_context->GetMLIRContext()); return HloInstructionIndexing::FromIndexingMaps({IndexingMap::FromTensorSizes( - forward_permutation, transpose->operand(0)->shape().dimensions(), {})}); + indexing_context, forward_permutation, + transpose->operand(0)->shape().dimensions(), {})}); } } // namespace IndexingMap GetBitcastMap(const Shape& input_shape, const Shape& output_shape, - MLIRContext* ctx) { + IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); ShapeUtil::BitcastDecomposition decomposed_bitcast = ShapeUtil::DecomposeBitcast(input_shape, output_shape); @@ -779,7 +832,8 @@ IndexingMap GetBitcastMap(const Shape& input_shape, const Shape& output_shape, << "Failed to deduce permutation for a bitcast."; return IndexingMap::FromTensorSizes( - ComputeTransposeIndexingMap(permutation.value(), ctx), + indexing_context, + ComputeTransposeIndexingMap(permutation.value(), mlir_context), input_shape.dimensions(), {}); } if (std::holds_alternative( @@ -787,38 +841,39 @@ IndexingMap GetBitcastMap(const Shape& input_shape, const Shape& output_shape, // Note: ComputeReshapeIndexingMap assumes it's computing an output->input // indexing, so input and output are reversed. return IndexingMap::FromTensorSizes( - ComputeReshapeIndexingMap(output_shape, input_shape, ctx), + indexing_context, + ComputeReshapeIndexingMap(output_shape, input_shape, mlir_context), input_shape.dimensions(), {}); } // `trt` stands for transpose-reshape-transpose decomposition of bitcast. 
auto trt = std::get(decomposed_bitcast); - auto transpose_map_1 = ComputeTransposeIndexingMap(trt.transpose1_dims, ctx); - auto reshape_map = - ComputeReshapeIndexingMap(trt.reshape_shape, trt.transpose1_shape, ctx); - auto transpose_map_2 = ComputeTransposeIndexingMap(trt.transpose2_dims, ctx); + auto transpose_map_1 = + ComputeTransposeIndexingMap(trt.transpose1_dims, mlir_context); + auto reshape_map = ComputeReshapeIndexingMap( + trt.reshape_shape, trt.transpose1_shape, mlir_context); + auto transpose_map_2 = + ComputeTransposeIndexingMap(trt.transpose2_dims, mlir_context); auto bitcast_map = transpose_map_2.compose(reshape_map).compose(transpose_map_1); - return IndexingMap::FromTensorSizes(bitcast_map, input_shape.dimensions(), - {}); + return IndexingMap::FromTensorSizes(indexing_context, bitcast_map, + input_shape.dimensions(), {}); } namespace { HloInstructionIndexing ComputeOutputToInputBitcastOpIndexing( - const HloInstruction* bitcast, MLIRContext* mlir_context) { - auto bitcast_map = GetBitcastMap(bitcast->shape(), - bitcast->operand(0)->shape(), mlir_context); + const HloInstruction* bitcast, IndexingContext* indexing_context) { + auto bitcast_map = GetBitcastMap( + bitcast->shape(), bitcast->operand(0)->shape(), indexing_context); bitcast_map.Simplify(); - return HloInstructionIndexing::FromIndexingMaps({bitcast_map}); } HloInstructionIndexing ComputeInputToOutputBitcastOpIndexing( - const HloInstruction* bitcast, MLIRContext* mlir_context) { + const HloInstruction* bitcast, IndexingContext* indexing_context) { auto bitcast_map = GetBitcastMap(bitcast->operand(0)->shape(), - bitcast->shape(), mlir_context); + bitcast->shape(), indexing_context); bitcast_map.Simplify(); - return HloInstructionIndexing::FromIndexingMaps({bitcast_map}); } @@ -867,32 +922,38 @@ llvm::SmallVector DelinearizeInBoundsIndex( return result; } -IndexingMap GetIndexingMapFromPhysicalLayoutToLogical(const Shape& shape, - MLIRContext* ctx) { +IndexingMap GetIndexingMapFromPhysicalLayoutToLogical( + const Shape& shape, IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); if (shape.rank() == 0) { - return IndexingMap(AffineMap::get(ctx), {}, {}); + return IndexingMap(indexing_context, AffineMap::get(mlir_context), {}, {}); } return IndexingMap::FromTensorSizes( + indexing_context, ComputeTransposeIndexingMap( - InversePermutation(ToTransposeDimensions(shape.layout())), ctx), + InversePermutation(ToTransposeDimensions(shape.layout())), + mlir_context), ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(shape) .dimensions(), {}); } -IndexingMap GetIndexingMapFromLogicalToPhysicalLayout(const Shape& shape, - MLIRContext* ctx) { +IndexingMap GetIndexingMapFromLogicalToPhysicalLayout( + const Shape& shape, IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); if (shape.rank() == 0) { - return IndexingMap(AffineMap::get(ctx), {}, {}); + return IndexingMap(indexing_context, AffineMap::get(mlir_context), {}, {}); } return IndexingMap::FromTensorSizes( - ComputeTransposeIndexingMap(ToTransposeDimensions(shape.layout()), ctx), + indexing_context, + ComputeTransposeIndexingMap(ToTransposeDimensions(shape.layout()), + mlir_context), shape.dimensions(), {}); } AffineMap GetBlockOffsetsForTiling(const Tiling& tiling, - mlir::MLIRContext* ctx) { - auto offsets = DelinearizeInBoundsIndex(getAffineDimExpr(3, ctx), + MLIRContext* mlir_context) { + auto offsets = DelinearizeInBoundsIndex(getAffineDimExpr(3, 
mlir_context), tiling.GetBlockCounts(), tiling.GetBlockStrides()); for (auto&& [offset, tile_size] : @@ -903,13 +964,13 @@ AffineMap GetBlockOffsetsForTiling(const Tiling& tiling, } AffineMap GetThreadOffsetsForTiling(const Tiling& tiling, - mlir::MLIRContext* ctx) { - auto offsets = DelinearizeInBoundsIndex(getAffineDimExpr(0, ctx), + MLIRContext* mlir_context) { + auto offsets = DelinearizeInBoundsIndex(getAffineDimExpr(0, mlir_context), tiling.GetThreadsPerBlock(), tiling.GetThreadStrides()); for (int dim = 0; dim < tiling.GetShape().size(); ++dim) { if (tiling.GetThreadTileSize()[dim] > 1) { - offsets[dim] = offsets[dim] + getAffineSymbolExpr(dim, ctx) * + offsets[dim] = offsets[dim] + getAffineSymbolExpr(dim, mlir_context) * tiling.GetThreadsPerBlock()[dim]; } } @@ -917,11 +978,13 @@ AffineMap GetThreadOffsetsForTiling(const Tiling& tiling, } IndexingMap GetIndexingMapForTiling(const Tiling& tiling, - mlir::MLIRContext* ctx) { + IndexingContext* indexing_context) { + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); return GetIndexingMapForTiling( - GetBlockOffsetsForTiling(tiling, ctx), - GetThreadOffsetsForTiling(tiling, ctx), tiling.GetNumThreadsPerBlock(), - tiling.GetNumBlocks(), tiling.GetThreadTileSize(), tiling.GetShape()); + GetBlockOffsetsForTiling(tiling, mlir_context), + GetThreadOffsetsForTiling(tiling, mlir_context), + tiling.GetNumThreadsPerBlock(), tiling.GetNumBlocks(), + tiling.GetThreadTileSize(), tiling.GetShape(), indexing_context); } IndexingMap GetIndexingMapForTiling(AffineMap block_offsets, @@ -929,7 +992,8 @@ IndexingMap GetIndexingMapForTiling(AffineMap block_offsets, int64_t threads_per_block, int64_t num_blocks, absl::Span thread_tile_sizes, - absl::Span tiled_shape) { + absl::Span tiled_shape, + IndexingContext* indexing_context) { llvm::SmallVector offsets; offsets.reserve(block_offsets.getNumResults()); for (auto [block, thread] : @@ -941,8 +1005,8 @@ IndexingMap GetIndexingMapForTiling(AffineMap block_offsets, }; auto affine_map = mlir::AffineMap::get(block_offsets.getNumDims(), block_offsets.getNumSymbols(), offsets, - offsets[0].getContext()); - IndexingMap map{affine_map, dimension_ranges, + indexing_context->GetMLIRContext()); + IndexingMap map{indexing_context, affine_map, dimension_ranges, RangesFromUpperBounds(thread_tile_sizes)}; for (int i = 0; i < tiled_shape.size(); ++i) { map.AddConstraint(affine_map.getResult(i), {0, tiled_shape[i] - 1}); @@ -1034,7 +1098,7 @@ GroupedByOpIndexingMap GroupIndexingMapsByProducers( GroupedByOpIndexingMap ComputeGroupedOutputToInputIndexing( const HloFusionAdaptor& fusion_adaptor, HloInstructionAdaptor target_instr, - MLIRContext* ctx) { + IndexingContext* ctx) { auto initial_map = CreateIdentityMap(target_instr.instruction().shape(), ctx); GroupedByOpIndexingMap grouped_indexing_maps; @@ -1088,9 +1152,9 @@ bool FuseProducerConsumerOutputToInputIndexing( const HloInstruction* producer_instr, absl::flat_hash_map* consumer_indexing, - MLIRContext* mlir_context) { + IndexingContext* indexing_context) { auto producer_indexing = ComputeOutputToInputIndexing( - producer_instr, /*output_id=*/0, mlir_context); + producer_instr, /*output_id=*/0, indexing_context); auto consumer_indexing_maps = (*consumer_indexing)[producer_instr]; for (const auto& [producer_operand_id, producer_operand_indexing] : llvm::enumerate(producer_indexing.indexing_maps)) { @@ -1109,7 +1173,7 @@ bool FuseProducerConsumerOutputToInputIndexing( HloInstructionIndexing ComputeOutputToInputIndexing(const HloInstruction* instr, int 
output_id, - MLIRContext* ctx) { + IndexingContext* ctx) { if (HloInstruction::IsOpElementwise(instr->opcode())) { return ComputeOutputToInputCwiseOpIndexing(instr, ctx); } @@ -1163,7 +1227,7 @@ HloInstructionIndexing ComputeOutputToInputIndexing(const HloInstruction* instr, HloInstructionIndexing ComputeInputToOutputIndexing(const HloInstruction* instr, int input_id, - MLIRContext* ctx) { + IndexingContext* ctx) { if (HloInstruction::IsOpElementwise(instr->opcode())) { return ComputeInputToOutputCwiseOpIndexing(instr, ctx); } @@ -1200,15 +1264,15 @@ HloInstructionIndexing ComputeInputToOutputIndexing(const HloInstruction* instr, } IndexingMap ComputeEpilogueInputToOutputIndexing( - const HloInstruction* epilogue_root, mlir::MLIRContext* ctx, + const HloInstruction* epilogue_root, IndexingContext* indexing_context, std::function is_root) { auto* instr = epilogue_root; - auto root_indexing = CreateIdentityMap(instr->shape(), ctx); + auto root_indexing = CreateIdentityMap(instr->shape(), indexing_context); while (!is_root(instr)) { // There can be multiple users, but they must have compatible indexing maps. auto* user = instr->users().front(); - auto user_indexing = - ComputeInputToOutputIndexing(user, user->operand_index(instr), ctx); + auto user_indexing = ComputeInputToOutputIndexing( + user, user->operand_index(instr), indexing_context); root_indexing = root_indexing * *user_indexing.indexing_maps[0].begin(); root_indexing.Simplify(); instr = user; diff --git a/third_party/xla/xla/service/gpu/model/indexing_analysis.h b/third_party/xla/xla/service/gpu/model/indexing_analysis.h index 59a56ae750a03d..47abac957e0e0e 100644 --- a/third_party/xla/xla/service/gpu/model/indexing_analysis.h +++ b/third_party/xla/xla/service/gpu/model/indexing_analysis.h @@ -67,15 +67,15 @@ std::string ToString(const mlir::AffineMap& affine_map); // Computes indexing maps for all input operands necessary to compute an element // of the `output_id` instruction output. -HloInstructionIndexing ComputeOutputToInputIndexing(const HloInstruction* instr, - int output_id, - mlir::MLIRContext* ctx); +HloInstructionIndexing ComputeOutputToInputIndexing( + const HloInstruction* instr, int output_id, + IndexingContext* indexing_context); // Computes indexing maps for all output operands that the element of the // `input_id` instruction input will participate in. -HloInstructionIndexing ComputeInputToOutputIndexing(const HloInstruction* instr, - int input_id, - mlir::MLIRContext* ctx); +HloInstructionIndexing ComputeInputToOutputIndexing( + const HloInstruction* instr, int input_id, + IndexingContext* indexing_context); // Computes the indexing for `epilogue_parent`'s epilogue. For example, if // `epilogue_parent` is a transpose, computes the input to output indexing for @@ -94,7 +94,7 @@ HloInstructionIndexing ComputeInputToOutputIndexing(const HloInstruction* instr, // FindNonTrivialHero, i.e., each instruction in the epilogue only has a single // user, or the users have identical indexing maps. IndexingMap ComputeEpilogueInputToOutputIndexing( - const HloInstruction* epilogue_root, mlir::MLIRContext* ctx, + const HloInstruction* epilogue_root, IndexingContext* indexing_context, std::function is_root = [](const HloInstruction* instr) { return instr->IsRoot(); }); @@ -105,7 +105,7 @@ using GroupedByOpIndexingMap = // cluster starting with `target_instr` and going from def to use. 
GroupedByOpIndexingMap ComputeGroupedOutputToInputIndexing( const HloFusionAdaptor& fusion_adaptor, HloInstructionAdaptor target_instr, - mlir::MLIRContext* ctx); + IndexingContext* indexing_context); // Groups indexing maps by instructions. absl::flat_hash_map @@ -118,44 +118,45 @@ bool FuseProducerConsumerOutputToInputIndexing( const HloInstruction* producer_instr, absl::flat_hash_map* consumer_indexing, - mlir::MLIRContext* mlir_context); + IndexingContext* mlir_context); // Creates an indexing map for bitcasting from `input_shape` to `output_shape`. // Equivalent to linearizing the input_shape index and then delinearizing it // to output_shape. IndexingMap GetBitcastMap(const Shape& input_shape, const Shape& output_shape, - mlir::MLIRContext* ctx); + IndexingContext* indexing_context); // Creates an indexing map from the physical layout of the tensor to its logical // layout. -IndexingMap GetIndexingMapFromPhysicalLayoutToLogical(const Shape& shape, - mlir::MLIRContext* ctx); +IndexingMap GetIndexingMapFromPhysicalLayoutToLogical( + const Shape& shape, IndexingContext* indexing_context); // Creates an indexing map from the logical layout of the tensor to its physical // layout. -IndexingMap GetIndexingMapFromLogicalToPhysicalLayout(const Shape& shape, - mlir::MLIRContext* ctx); +IndexingMap GetIndexingMapFromLogicalToPhysicalLayout( + const Shape& shape, IndexingContext* indexing_context); // Creates an indexing map from thread and block IDs to elements of the tiled // shape. Uses the same convention as KernelFusionInterface: dimensions 0 to 2 // are thread indices (currently only 0 is used), dimensions 3 to 5 are block // indices (currently only 3 is used). mlir::AffineMap GetBlockOffsetsForTiling(const Tiling& tiling, - mlir::MLIRContext* ctx); + mlir::MLIRContext* mlir_context); mlir::AffineMap GetThreadOffsetsForTiling(const Tiling& tiling, - mlir::MLIRContext* ctx); + mlir::MLIRContext* mlir_context); // Convenience functions for the two functions above // (`GetBlockOffsestsForTiling` + `GetThreadOffsetsForTiling`). Also sets up // the ranges of dimensions and symbols. IndexingMap GetIndexingMapForTiling(const Tiling& tiling, - mlir::MLIRContext* ctx); + IndexingContext* indexing_context); IndexingMap GetIndexingMapForTiling(mlir::AffineMap block_offsets, mlir::AffineMap thread_offsets, int64_t threads_per_block, int64_t num_blocks, absl::Span thread_tile_sizes, - absl::Span tiled_shape); + absl::Span tiled_shape, + IndexingContext* indexing_context); // Returns the shape of the output of the instruction. 
const Shape& GetOutputShape(const HloInstruction* instr, int64_t output_id); diff --git a/third_party/xla/xla/service/gpu/model/indexing_analysis_test.cc b/third_party/xla/xla/service/gpu/model/indexing_analysis_test.cc index 7388d194d3eaa4..39cade7c560b58 100644 --- a/third_party/xla/xla/service/gpu/model/indexing_analysis_test.cc +++ b/third_party/xla/xla/service/gpu/model/indexing_analysis_test.cc @@ -91,7 +91,7 @@ TEST_F(IndexingAnalysisTest, ComputeGroupedOutputToInputIndexing) { auto fusion_adaptor = ProducerConsumerFusion(transpose, root); auto grouped_indexing = ComputeGroupedOutputToInputIndexing( - fusion_adaptor, fusion_adaptor.GetRoots()[0], &mlir_context_); + fusion_adaptor, fusion_adaptor.GetRoots()[0], &indexing_context_); EXPECT_THAT(grouped_indexing, UnorderedElementsAre( Pair(root, ElementsAre(MatchIndexingMap(R"( @@ -148,7 +148,7 @@ TEST_F(IndexingAnalysisTest, auto fusion_adaptor = HloFusionAdaptor::ForInstruction(root); auto grouped_indexing = ComputeGroupedOutputToInputIndexing( - *fusion_adaptor, fusion_adaptor->GetRoots()[0], &mlir_context_); + *fusion_adaptor, fusion_adaptor->GetRoots()[0], &indexing_context_); EXPECT_THAT(grouped_indexing, UnorderedElementsAre( @@ -200,7 +200,7 @@ TEST_F(IndexingAnalysisTest, ComputeGroupedOutputToInputIndexing_SingleOp) { auto fusion_adaptor = HloFusionAdaptor::ForInstruction(exponential); HloInstructionAdaptor parameter_adaptor(*parameter); auto grouped_indexing = ComputeGroupedOutputToInputIndexing( - *fusion_adaptor, parameter_adaptor, &mlir_context_); + *fusion_adaptor, parameter_adaptor, &indexing_context_); EXPECT_THAT(grouped_indexing, UnorderedElementsAre(Pair( parameter, ElementsAre(MatchIndexingMap(R"( (d0, d1) -> (d0, d1) @@ -240,7 +240,7 @@ TEST_F(IndexingAnalysisTest, auto parameter_0 = bcast.GetOperand(0); auto grouped_indexing = ComputeGroupedOutputToInputIndexing( - *fusion_adaptor, bcast, &mlir_context_); + *fusion_adaptor, bcast, &indexing_context_); EXPECT_THAT( grouped_indexing, UnorderedElementsAre( @@ -2083,7 +2083,7 @@ TEST_F(IndexingAnalysisTest, TilingIndexing) { Tiling tiling{/*shape=*/{1022, 256, 16}, /*tile_sizes=*/{8, 1, 4}, /*num_threads=*/{1, 4, 4}}; - auto indexing_map = GetIndexingMapForTiling(tiling, &mlir_context_); + auto indexing_map = GetIndexingMapForTiling(tiling, &indexing_context_); indexing_map.Simplify(); EXPECT_THAT(indexing_map.ToString(), MatchIndexingString(R"( (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> ( @@ -2118,7 +2118,7 @@ TEST_F(IndexingAnalysisTest, EpilogueIndexing) { ASSERT_TRUE(module.ok()); EXPECT_THAT(ComputeEpilogueInputToOutputIndexing( (*module)->entry_computation()->GetInstructionWithName("t"), - &mlir_context_) + &indexing_context_) .ToString(), MatchIndexingString(R"( (d0, d1) -> (d0 + d1 * 1000) @@ -2139,7 +2139,7 @@ TEST_F(IndexingAnalysisTest, EpilogueIndexing_NoEpilogue) { ASSERT_TRUE(module.ok()); EXPECT_THAT(ComputeEpilogueInputToOutputIndexing( (*module)->entry_computation()->GetInstructionWithName("t"), - &mlir_context_) + &indexing_context_) .ToString(), MatchIndexingString(R"( (d0, d1) -> (d0, d1) diff --git a/third_party/xla/xla/service/gpu/model/indexing_context.cc b/third_party/xla/xla/service/gpu/model/indexing_context.cc new file mode 100644 index 00000000000000..f44e4977e41baa --- /dev/null +++ b/third_party/xla/xla/service/gpu/model/indexing_context.cc @@ -0,0 +1,27 @@ +/* Copyright 2024 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "xla/service/gpu/model/indexing_context.h" + +namespace xla { +namespace gpu { + +IndexingContext::RTValsID IndexingContext::RegisterRTSymbol( + const HloInstruction* instr, IndexingMap indexing_map) { + return 0; +} + +} // namespace gpu +} // namespace xla diff --git a/third_party/xla/xla/service/gpu/model/indexing_context.h b/third_party/xla/xla/service/gpu/model/indexing_context.h new file mode 100644 index 00000000000000..2560cd09ab1864 --- /dev/null +++ b/third_party/xla/xla/service/gpu/model/indexing_context.h @@ -0,0 +1,54 @@ +/* Copyright 2024 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef XLA_SERVICE_GPU_MODEL_INDEXING_CONTEXT_H_ +#define XLA_SERVICE_GPU_MODEL_INDEXING_CONTEXT_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "xla/hlo/ir/hlo_instruction.h" +#include "xla/service/gpu/model/indexing_map.h" + +namespace xla { +namespace gpu { + +class IndexingContext { + public: + using RTValsID = int64_t; + + explicit IndexingContext(mlir::MLIRContext* mlir_context) + : mlir_context_(mlir_context) {} + + mlir::MLIRContext* GetMLIRContext() const { return mlir_context_; } + + // TBD: This method should behave like a thread-safe counter. It will register + // a new RTSymbol by adding it to `rt_vals_registry_` with the newly generated + // ID. + RTValsID RegisterRTSymbol(const HloInstruction* instr, + IndexingMap indexing_map); + + private: + mlir::MLIRContext* mlir_context_; + absl::flat_hash_map> + rt_vals_registry_; +}; + +} // namespace gpu +} // namespace xla + +#endif // XLA_SERVICE_GPU_MODEL_INDEXING_CONTEXT_H_ diff --git a/third_party/xla/xla/service/gpu/model/indexing_map.cc b/third_party/xla/xla/service/gpu/model/indexing_map.cc index 27ec08b41c58bc..7d92c7dcb9e6e1 100644 --- a/third_party/xla/xla/service/gpu/model/indexing_map.cc +++ b/third_party/xla/xla/service/gpu/model/indexing_map.cc @@ -35,6 +35,7 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "xla/service/gpu/model/affine_map_printer.h" +#include "xla/service/gpu/model/indexing_context.h" #include "tsl/platform/logging.h" // IWYU pragma: keep namespace xla { @@ -376,7 +377,7 @@ AffineExpr AffineExprSimplifier::SimplifyOnce(AffineExpr expr) { auto rhs = SimplifyOnce(binop.getRHS()); // Rewrite `(x // c) * c + (x % c)` to `x`. 
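   // For instance, with c = 8 and x = 19: (19 floordiv 8) * 8 + (19 mod 8) = 16 + 3 = 19.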
- // TODO(jreiffers): This should also work with (a+b)+c. + // This should also work with (a+b)+c. auto rewrite_add = [&](AffineExpr a, AffineExpr b) -> AffineExpr { if (auto mod = GetConstantRhs(a, AffineExprKind::Mod)) { if (auto mul = GetConstantRhs(b, AffineExprKind::Mul); mod == mul) { @@ -596,12 +597,22 @@ std::vector RangesFromTensorSizes( } IndexingMap IndexingMap::FromTensorSizes( - AffineMap affine_map, absl::Span dim_upper_bounds, + IndexingContext* indexing_context, AffineMap affine_map, + absl::Span dim_upper_bounds, absl::Span symbol_upper_bounds) { - return IndexingMap{affine_map, RangesFromTensorSizes(dim_upper_bounds), + return IndexingMap{indexing_context, affine_map, + RangesFromTensorSizes(dim_upper_bounds), RangesFromTensorSizes(symbol_upper_bounds)}; } +mlir::MLIRContext* IndexingMap::GetMLIRContext() const { + return indexing_context_->GetMLIRContext(); +} + +IndexingContext* IndexingMap::GetIndexingContext() const { + return indexing_context_; +} + void IndexingMap::AddConstraint(mlir::AffineExpr expr, Interval range) { if (auto dim_expr = mlir::dyn_cast(expr)) { Interval& current_range = dim_ranges_[dim_expr.getPosition()]; @@ -1011,7 +1022,9 @@ IndexingMap ComposeIndexingMaps(const IndexingMap& first, combined_symbol_ranges.push_back(symbol_range); } - IndexingMap composed_indexing_map(composed_map, first.GetDimensionRanges(), + IndexingContext* indexing_context = first.GetIndexingContext(); + IndexingMap composed_indexing_map(indexing_context, composed_map, + first.GetDimensionRanges(), std::move(combined_symbol_ranges)); // Add constraints that are already present in the producer_map. We have to // compute consumer_map(producer_constraints). To keep all symbols and diff --git a/third_party/xla/xla/service/gpu/model/indexing_map.h b/third_party/xla/xla/service/gpu/model/indexing_map.h index 655d877745860f..e6e84bf82a9f7d 100644 --- a/third_party/xla/xla/service/gpu/model/indexing_map.h +++ b/third_party/xla/xla/service/gpu/model/indexing_map.h @@ -36,6 +36,8 @@ limitations under the License. namespace xla { namespace gpu { +class IndexingContext; + // Interval represents a closed interval [lower_bound, upper_bound]. 
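 // For example, Interval{0, 3} contains exactly the values 0, 1, 2, and 3.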
struct Interval { std::string ToString() const; @@ -166,10 +168,11 @@ std::vector RangesFromTensorSizes( class IndexingMap { public: IndexingMap( - mlir::AffineMap affine_map, std::vector dim_ranges, - std::vector symbol_ranges, + IndexingContext* indexing_context, mlir::AffineMap affine_map, + std::vector dim_ranges, std::vector symbol_ranges, absl::Span> constraints = {}) - : affine_map_(affine_map), + : indexing_context_(indexing_context), + affine_map_(affine_map), dim_ranges_(std::move(dim_ranges)), symbol_ranges_(std::move(symbol_ranges)) { for (const auto& [expr, range] : constraints) { @@ -177,10 +180,12 @@ class IndexingMap { } } - IndexingMap(mlir::AffineMap affine_map, std::vector dim_ranges, + IndexingMap(IndexingContext* indexing_context, mlir::AffineMap affine_map, + std::vector dim_ranges, std::vector symbol_ranges, const llvm::DenseMap& constraints) - : affine_map_(affine_map), + : indexing_context_(indexing_context), + affine_map_(affine_map), dim_ranges_(std::move(dim_ranges)), symbol_ranges_(std::move(symbol_ranges)), constraints_(constraints) {} @@ -188,7 +193,8 @@ class IndexingMap { static IndexingMap GetUndefined() { return IndexingMap(); } static IndexingMap FromTensorSizes( - mlir::AffineMap affine_map, absl::Span dim_upper_bounds, + IndexingContext* indexing_context, mlir::AffineMap affine_map, + absl::Span dim_upper_bounds, absl::Span symbol_upper_bounds); std::string ToString( @@ -200,7 +206,10 @@ class IndexingMap { bool Simplify(); // Return MLIRContext. - mlir::MLIRContext* GetMLIRContext() const { return affine_map_.getContext(); } + mlir::MLIRContext* GetMLIRContext() const; + + // Return IndexingContext. + IndexingContext* GetIndexingContext() const; // Returns the affine map. mlir::AffineMap GetAffineMap() const { return affine_map_; } @@ -265,6 +274,7 @@ class IndexingMap { // Returns true if simplification was performed. 
bool SimplifyConstraintRanges(); + IndexingContext* indexing_context_ = nullptr; mlir::AffineMap affine_map_; std::vector dim_ranges_; std::vector symbol_ranges_; diff --git a/third_party/xla/xla/service/gpu/model/indexing_map_test.cc b/third_party/xla/xla/service/gpu/model/indexing_map_test.cc index 2e7cbd309b1489..ffc6743863244d 100644 --- a/third_party/xla/xla/service/gpu/model/indexing_map_test.cc +++ b/third_party/xla/xla/service/gpu/model/indexing_map_test.cc @@ -35,12 +35,16 @@ using ::testing::ElementsAre; class IndexingMapTest : public HloTestBase { public: + IndexingMapTest() + : HloTestBase(), mlir_context_(), indexing_context_(&mlir_context_) {} mlir::MLIRContext mlir_context_; + IndexingContext indexing_context_; AffineMapPrinter printer_; }; TEST_F(IndexingMapTest, Evaluation) { IndexingMap indexing_map = IndexingMap::FromTensorSizes( + &indexing_context_, ParseAffineMap("(d0, d1)[s0, s1] -> (d1, d0, s1, s0)", &mlir_context_), {4, 4}, {2, 2}); @@ -65,10 +69,12 @@ TEST_F(IndexingMapTest, Evaluation) { TEST_F(IndexingMapTest, Composition_Permutation) { IndexingMap producer = IndexingMap::FromTensorSizes( + &indexing_context_, ParseAffineMap("(d0, d1)[s0, s1] -> (d1, d0, s1, s0)", &mlir_context_), {4, 4}, {2, 2}); IndexingMap consumer = IndexingMap::FromTensorSizes( + &indexing_context_, ParseAffineMap("(d0)[s0] -> (d0, s0)", &mlir_context_), {4}, {4}); auto composed = ComposeIndexingMaps(consumer, producer); @@ -84,10 +90,12 @@ TEST_F(IndexingMapTest, Composition_Permutation) { TEST_F(IndexingMapTest, Composition_RestrictedInterval) { IndexingMap producer = IndexingMap::FromTensorSizes( + &indexing_context_, ParseAffineMap("(d0, d1)[s0, s1] -> (d1, d0, s1, s0)", &mlir_context_), {5, 6}, {7, 2}); IndexingMap consumer = IndexingMap::FromTensorSizes( + &indexing_context_, ParseAffineMap("(d0)[s0] -> (d0, s0)", &mlir_context_), {10}, {8}); auto composed = ComposeIndexingMaps(consumer, producer); @@ -103,6 +111,7 @@ TEST_F(IndexingMapTest, Composition_RestrictedInterval) { TEST_F(IndexingMapTest, Composition_ProducerAndConsumerHaveConstraints) { IndexingMap producer = IndexingMap::FromTensorSizes( + &indexing_context_, ParseAffineMap("(d0, d1)[s0, s1] -> (d1, d0, s1, s0)", &mlir_context_), {50, 60}, {70, 20}); producer.AddConstraint(ParseAffineExpr("d0 mod 8", &mlir_context_), @@ -111,6 +120,7 @@ TEST_F(IndexingMapTest, Composition_ProducerAndConsumerHaveConstraints) { Interval{1, 1}); IndexingMap consumer = IndexingMap::FromTensorSizes( + &indexing_context_, ParseAffineMap("(d0)[s0] -> (d0, s0)", &mlir_context_), {10}, {8}); consumer.AddConstraint(ParseAffineExpr("d0 + s0", &mlir_context_), Interval{0, 20}); @@ -146,6 +156,7 @@ TEST_F(IndexingMapTest, Composition_ProducerAndConsumerHaveConstraints) { TEST_F(IndexingMapTest, RemoveUnusedSymbols_ConstraintUsesSymbol) { IndexingMap indexing_map = IndexingMap::FromTensorSizes( + &indexing_context_, ParseAffineMap("(d0, d1)[s0, s1] -> (d1, d0, s1)", &mlir_context_), {50, 60}, {70, 20}); // This constraint cannot be removed, because it contains a "used symbol". @@ -168,6 +179,7 @@ TEST_F(IndexingMapTest, RemoveUnusedSymbols_ConstraintUsesSymbol) { TEST_F(IndexingMapTest, RemoveUnusedSymbols_ConstraintUsesOnlyUnusedSymbols) { IndexingMap indexing_map = IndexingMap::FromTensorSizes( + &indexing_context_, ParseAffineMap("(d0, d1)[s0, s1] -> (d1, d0, s1)", &mlir_context_), {50, 60}, {70, 20}); // This constraint can be removed, because it contains only the unused symbol. 
@@ -185,6 +197,7 @@ TEST_F(IndexingMapTest, RemoveUnusedSymbols_ConstraintUsesOnlyUnusedSymbols) { TEST_F(IndexingMapTest, RemoveUnusedSymbols_ConstraintsWithManySymbols) { IndexingMap indexing_map = IndexingMap::FromTensorSizes( + &indexing_context_, ParseAffineMap("(d0)[s0, s1, s2, s3, s4] -> (d0 * 4 + s1 + s3 - 42)", &mlir_context_), {32}, {1, 2, 3, 4, 5}); @@ -204,7 +217,8 @@ TEST_F(IndexingMapTest, RemoveUnusedSymbols_ConstraintsWithManySymbols) { TEST_F(IndexingMapTest, ConstraintIntervalSimplification_Sum) { IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap("(d0) -> (d0)", &mlir_context_), {100}, {}); + &indexing_context_, ParseAffineMap("(d0) -> (d0)", &mlir_context_), {100}, + {}); indexing_map.AddConstraint(ParseAffineExpr("(d0 mod 8) + 5", &mlir_context_), Interval{50, 54}); @@ -220,7 +234,8 @@ TEST_F(IndexingMapTest, ConstraintIntervalSimplification_Sum) { TEST_F(IndexingMapTest, ConstraintIntervalSimplification_FloorDivPositiveDivisorPositiveBounds) { IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap("(d0) -> (d0)", &mlir_context_), {100}, {}); + &indexing_context_, ParseAffineMap("(d0) -> (d0)", &mlir_context_), {100}, + {}); indexing_map.AddConstraint(ParseAffineExpr("d0 floordiv 8", &mlir_context_), Interval{5, 11}); @@ -233,9 +248,9 @@ TEST_F(IndexingMapTest, TEST_F(IndexingMapTest, ConstraintIntervalSimplification_FloorDivPositiveDivisorNegativeBounds) { - IndexingMap indexing_map = - IndexingMap(ParseAffineMap("(d0)[s0] -> (d0)", &mlir_context_), - {Interval{0, 99}}, {Interval{-99, 99}}); + IndexingMap indexing_map = IndexingMap( + &indexing_context_, ParseAffineMap("(d0)[s0] -> (d0)", &mlir_context_), + {Interval{0, 99}}, {Interval{-99, 99}}); indexing_map.AddConstraint(ParseAffineExpr("s0 floordiv 3", &mlir_context_), Interval{-11, -5}); @@ -249,9 +264,9 @@ TEST_F(IndexingMapTest, TEST_F(IndexingMapTest, ConstraintIntervalSimplification_FloorDivNegativeDivisorNegativeBounds) { - IndexingMap indexing_map = - IndexingMap(ParseAffineMap("(d0)[s0] -> (d0)", &mlir_context_), - {Interval{0, 99}}, {Interval{-99, 99}}); + IndexingMap indexing_map = IndexingMap( + &indexing_context_, ParseAffineMap("(d0)[s0] -> (d0)", &mlir_context_), + {Interval{0, 99}}, {Interval{-99, 99}}); indexing_map.AddConstraint(ParseAffineExpr("s0 floordiv -3", &mlir_context_), Interval{-11, -5}); @@ -266,7 +281,8 @@ TEST_F(IndexingMapTest, TEST_F(IndexingMapTest, ConstraintIntervalSimplification_MulPositiveMultiplierPositiveBounds) { IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap("(d0) -> (d0)", &mlir_context_), {100}, {}); + &indexing_context_, ParseAffineMap("(d0) -> (d0)", &mlir_context_), {100}, + {}); indexing_map.AddConstraint(ParseAffineExpr("d0 * 8", &mlir_context_), Interval{14, 33}); @@ -279,9 +295,9 @@ TEST_F(IndexingMapTest, TEST_F(IndexingMapTest, ConstraintIntervalSimplification_MulPositiveMultiplierNegativeBounds) { - IndexingMap indexing_map = - IndexingMap(ParseAffineMap("(d0)[s0] -> (d0)", &mlir_context_), - {Interval{0, 99}}, {Interval{-99, 99}}); + IndexingMap indexing_map = IndexingMap( + &indexing_context_, ParseAffineMap("(d0)[s0] -> (d0)", &mlir_context_), + {Interval{0, 99}}, {Interval{-99, 99}}); indexing_map.AddConstraint(ParseAffineExpr("s0 * 3", &mlir_context_), Interval{-11, -5}); @@ -295,9 +311,9 @@ TEST_F(IndexingMapTest, TEST_F(IndexingMapTest, ConstraintIntervalSimplification_MulNegativeMultiplierNegativeBounds) { - IndexingMap indexing_map = - IndexingMap(ParseAffineMap("(d0)[s0] -> (d0)", 
&mlir_context_), - {Interval{0, 99}}, {Interval{-99, 99}}); + IndexingMap indexing_map = IndexingMap( + &indexing_context_, ParseAffineMap("(d0)[s0] -> (d0)", &mlir_context_), + {Interval{0, 99}}, {Interval{-99, 99}}); indexing_map.AddConstraint(ParseAffineExpr("s0 * -3", &mlir_context_), Interval{-11, -5}); @@ -311,7 +327,8 @@ TEST_F(IndexingMapTest, TEST_F(IndexingMapTest, AffineMapSimplification_ConstantDims) { IndexingMap indexing_map = IndexingMap( - ParseAffineMap("(d0) -> (d0)", &mlir_context_), {Interval{5, 5}}, {}); + &indexing_context_, ParseAffineMap("(d0) -> (d0)", &mlir_context_), + {Interval{5, 5}}, {}); indexing_map.Simplify(); EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"( (d0) -> (5) @@ -324,7 +341,8 @@ TEST_F(IndexingMapTest, AffineMapSimplification_DivsAndModsIfSmallerThanDivisor) { auto serialized_map = "(d0, d1) -> (d0 + d1 floordiv 16, d1 mod 16)"; IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap(serialized_map, &mlir_context_), {8, 16}, {}); + &indexing_context_, ParseAffineMap(serialized_map, &mlir_context_), + {8, 16}, {}); indexing_map.Simplify(); EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"( (d0, d1) -> (d0, d1) @@ -341,7 +359,8 @@ TEST_F(IndexingMapTest, AffineMapSimplification_DivsAndModsWithMultipliers) { "d2 mod 10)"; IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap(serialized_map, &mlir_context_), {9, 9, 9}, {}); + &indexing_context_, ParseAffineMap(serialized_map, &mlir_context_), + {9, 9, 9}, {}); indexing_map.Simplify(); EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"( @@ -360,7 +379,8 @@ TEST_F(IndexingMapTest, " (d0 * 16 + d1 * 4 + d2) mod 8)"; IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap(serialized_map, &mlir_context_), {10, 10, 10}, {}); + &indexing_context_, ParseAffineMap(serialized_map, &mlir_context_), + {10, 10, 10}, {}); indexing_map.Simplify(); EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"( (d0, d1, d2) -> (d0 * 2 + (d1 + d2 floordiv 4) floordiv 2, @@ -377,7 +397,8 @@ TEST_F(IndexingMapTest, AffineMapSimplification_DivsAndModsWithReverse) { "(d0, d1) -> (-((d0 * -11 - d1 + 109) floordiv 11) + 9, " "d0 * 11 + d1 + ((d0 * -11 - d1 + 109) floordiv 11) * 11 - 99)"; IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap(serialized_map, &mlir_context_), {8, 9}, {}); + &indexing_context_, ParseAffineMap(serialized_map, &mlir_context_), + {8, 9}, {}); indexing_map.Simplify(); EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"( (d0, d1) -> (d0, d1) @@ -391,7 +412,8 @@ TEST_F(IndexingMapTest, AffineMapSimplification_SimplifyReshape) { auto serialized_map = "()[s0] -> ((s0 * 128) mod 715 + ((s0 * 128) floordiv 715) * 715)"; IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap(serialized_map, &mlir_context_), {}, {128}); + &indexing_context_, ParseAffineMap(serialized_map, &mlir_context_), {}, + {128}); indexing_map.Simplify(); EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"( ()[s0] -> (s0 * 128) @@ -404,7 +426,8 @@ TEST_F(IndexingMapTest, AffineMapSimplification_SimplifyReshape_Regression) { auto serialized_map = "()[s0] -> ((s0 * 128) mod 715 + ((s0 * 64) floordiv 715) * 715)"; IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap(serialized_map, &mlir_context_), {}, {128}); + &indexing_context_, ParseAffineMap(serialized_map, &mlir_context_), {}, + {128}); indexing_map.Simplify(); 
EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"( ()[s0] -> ((s0 * 128) mod 715 + ((s0 * 64) floordiv 715) * 715) @@ -417,7 +440,8 @@ TEST_F(IndexingMapTest, AffineMapSimplification_DivsInSequence) { "()[s0] -> (s0 - ((s0 floordiv 2) floordiv 7) * 14 + (s0 floordiv 14) * " "14)"; IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap(serialized_map, &mlir_context_), {}, {1234}); + &indexing_context_, ParseAffineMap(serialized_map, &mlir_context_), {}, + {1234}); indexing_map.Simplify(); EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"( ()[s0] -> (s0) @@ -431,7 +455,8 @@ TEST_F(IndexingMapTest, AffineMapSimplification_DivGcdGreater1) { "()[s0, s1, s2] -> (s0 * 512 + s1 * 4 + s2 - ((s0 * 2 + s1 floordiv 64) " "floordiv 3) * 768 + ((s0 * 128 + s1) floordiv 192) * 768)"; IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap(serialized_map, &mlir_context_), {}, {1234, 128, 4}); + &indexing_context_, ParseAffineMap(serialized_map, &mlir_context_), {}, + {1234, 128, 4}); indexing_map.Simplify(); EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"( ()[s0, s1, s2] -> (s0 * 512 + s1 * 4 + s2) @@ -447,7 +472,8 @@ TEST_F(IndexingMapTest, AffineMapSimplification_ExtractFromMod) { "()[s0, s1, s2, s3] -> ((s0 * 458752 + s1 + s2 * 4 + s3 * 512) mod " "20000)"; IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap(serialized_map, &mlir_context_), {}, {872, 4, 128, 896}); + &indexing_context_, ParseAffineMap(serialized_map, &mlir_context_), {}, + {872, 4, 128, 896}); indexing_map.Simplify(); EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"( ()[s0, s1, s2, s3] -> ( @@ -467,7 +493,8 @@ TEST_F(IndexingMapTest, "()[s0, s1] -> ((s0 * 16 - (s1 floordiv 4) floordiv 2 + (s1 floordiv 8) " "* 2) floordiv 4)"; IndexingMap indexing_map = IndexingMap::FromTensorSizes( - ParseAffineMap(serialized_map, &mlir_context_), {}, {2, 128}); + &indexing_context_, ParseAffineMap(serialized_map, &mlir_context_), {}, + {2, 128}); indexing_map.Simplify(); EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"( ()[s0, s1] -> ( diff --git a/third_party/xla/xla/service/gpu/model/indexing_test_utils.cc b/third_party/xla/xla/service/gpu/model/indexing_test_utils.cc index e7b7e39ac71325..55fa6433ea77ba 100644 --- a/third_party/xla/xla/service/gpu/model/indexing_test_utils.cc +++ b/third_party/xla/xla/service/gpu/model/indexing_test_utils.cc @@ -53,17 +53,17 @@ HloInstruction* IndexingTestBase::ParseAndGetRoot( HloInstructionIndexing IndexingTestBase::GetOutputToInputIndexing( const HloInstruction* instr, int output_id, bool use_physical_layout) { HloInstructionIndexing indexing = - ComputeOutputToInputIndexing(instr, output_id, &mlir_context_); + ComputeOutputToInputIndexing(instr, output_id, &indexing_context_); if (!use_physical_layout) return indexing; IndexingMap output_permutation = GetIndexingMapFromPhysicalLayoutToLogical( - GetOutputShape(instr, output_id), &mlir_context_); + GetOutputShape(instr, output_id), &indexing_context_); for (const auto& [operand_id, indexing_maps] : llvm::enumerate(indexing.indexing_maps)) { IndexingMap operand_permutation = GetIndexingMapFromLogicalToPhysicalLayout( - instr->operand(operand_id)->shape(), &mlir_context_); + instr->operand(operand_id)->shape(), &indexing_context_); absl::flat_hash_set operand_indexing_maps; for (const IndexingMap& indexing_map : indexing_maps) { @@ -86,17 +86,17 @@ HloInstructionIndexing IndexingTestBase::GetOutputToInputIndexing( 
HloInstructionIndexing IndexingTestBase::GetInputToOutputIndexing( const HloInstruction* instr, int input_id, bool use_physical_layout) { HloInstructionIndexing indexing = - ComputeInputToOutputIndexing(instr, input_id, &mlir_context_); + ComputeInputToOutputIndexing(instr, input_id, &indexing_context_); if (!use_physical_layout) return indexing; IndexingMap input_permutation = GetIndexingMapFromPhysicalLayoutToLogical( - instr->operand(input_id)->shape(), &mlir_context_); + instr->operand(input_id)->shape(), &indexing_context_); for (const auto& [output_id, indexing_maps] : llvm::enumerate(indexing.indexing_maps)) { IndexingMap operand_permutation = GetIndexingMapFromLogicalToPhysicalLayout( - GetOutputShape(instr, output_id), &mlir_context_); + GetOutputShape(instr, output_id), &indexing_context_); absl::flat_hash_set operand_indexing_maps; for (const IndexingMap& indexing_map : indexing_maps) { diff --git a/third_party/xla/xla/service/gpu/model/indexing_test_utils.h b/third_party/xla/xla/service/gpu/model/indexing_test_utils.h index 62abd0e5e7fdb4..a0a304b0d43104 100644 --- a/third_party/xla/xla/service/gpu/model/indexing_test_utils.h +++ b/third_party/xla/xla/service/gpu/model/indexing_test_utils.h @@ -26,6 +26,7 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "xla/hlo/ir/hlo_instruction.h" #include "xla/service/gpu/model/indexing_analysis.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/tests/hlo_test_base.h" #include "xla/tests/verified_hlo_module.h" @@ -52,6 +53,8 @@ MATCHER_P(MatchIndexingString, indexing_string, "") { class IndexingTestBase : public HloTestBase { public: + IndexingTestBase() + : HloTestBase(), mlir_context_(), indexing_context_(&mlir_context_) {} HloInstruction* ParseAndGetRoot(absl::string_view hlo_string); HloInstructionIndexing GetOutputToInputIndexing( @@ -63,6 +66,7 @@ class IndexingTestBase : public HloTestBase { bool use_physical_layout = false); mlir::MLIRContext mlir_context_; + IndexingContext indexing_context_; std::unique_ptr module_; }; diff --git a/third_party/xla/xla/service/gpu/model/tile_analysis.cc b/third_party/xla/xla/service/gpu/model/tile_analysis.cc index 3599d19575e70e..8560c10de5af0d 100644 --- a/third_party/xla/xla/service/gpu/model/tile_analysis.cc +++ b/third_party/xla/xla/service/gpu/model/tile_analysis.cc @@ -32,6 +32,7 @@ limitations under the License. 
#include "mlir/IR/AffineMap.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "xla/service/gpu/model/affine_map_printer.h" +#include "xla/service/gpu/model/indexing_context.h" #include "xla/service/gpu/model/indexing_map.h" namespace xla { @@ -252,7 +253,8 @@ std::optional RawSymbolicTileFromIndexingMap( /*static*/ std::optional SymbolicTile::FromIndexingMap( const IndexingMap& indexing_map) { - MLIRContext* mlir_context = indexing_map.GetAffineMap().getContext(); + IndexingContext* indexing_context = indexing_map.GetIndexingContext(); + MLIRContext* mlir_context = indexing_context->GetMLIRContext(); int64_t num_input_dims = indexing_map.GetDimensionCount(); std::vector exprs; exprs.reserve(num_input_dims); @@ -294,8 +296,8 @@ std::optional RawSymbolicTileFromIndexingMap( mlir_context); IndexingMap composed_indexing_map( - indexing_map.GetAffineMap().compose(producer_map), tile_dimension_ranges, - tile_symbol_ranges); + indexing_context, indexing_map.GetAffineMap().compose(producer_map), + tile_dimension_ranges, tile_symbol_ranges); composed_indexing_map.Simplify(); From b9752df1863297248ca2e6a74cb11125aa520474 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 Mar 2024 09:50:41 -0700 Subject: [PATCH 028/670] Include ifrt_proxy in xla_extension.so (so it is propagated to jaxlib). PiperOrigin-RevId: 616858886 --- third_party/xla/xla/python/BUILD | 1 + .../xla/xla/python/ifrt_proxy/client/BUILD | 12 +++++-- .../xla/python/ifrt_proxy/client/py_module.cc | 22 +++++++------ .../xla/python/ifrt_proxy/client/py_module.h | 31 ++++++++++++++++++ .../xla/xla/python/ifrt_proxy/jax/BUILD | 1 - .../ifrt_proxy/jax/ifrt_proxy_internal.py | 8 +++-- third_party/xla/xla/python/xla.cc | 5 +++ .../xla/xla/python/xla_extension/__init__.pyi | 1 + .../xla/python/xla_extension/ifrt_proxy.pyi | 32 +++++++++++++++++++ 9 files changed, 97 insertions(+), 16 deletions(-) create mode 100644 third_party/xla/xla/python/ifrt_proxy/client/py_module.h create mode 100644 third_party/xla/xla/python/xla_extension/ifrt_proxy.pyi diff --git a/third_party/xla/xla/python/BUILD b/third_party/xla/xla/python/BUILD index 508a1a82233fe7..3668ab3ba2ff5a 100644 --- a/third_party/xla/xla/python/BUILD +++ b/third_party/xla/xla/python/BUILD @@ -1258,6 +1258,7 @@ cc_library( "//xla/pjrt/distributed:protocol_proto_cc", "//xla/pjrt/distributed:service", "//xla/python/ifrt", + "//xla/python/ifrt_proxy/client:py_module", "//xla/python/pjrt_ifrt", "//xla/service/cpu:collectives_interface", "@local_tsl//tsl/distributed_runtime/preemption:preemption_sync_manager", diff --git a/third_party/xla/xla/python/ifrt_proxy/client/BUILD b/third_party/xla/xla/python/ifrt_proxy/client/BUILD index 7a6071ad0b2aee..7a989ec885c38a 100644 --- a/third_party/xla/xla/python/ifrt_proxy/client/BUILD +++ b/third_party/xla/xla/python/ifrt_proxy/client/BUILD @@ -14,7 +14,6 @@ load("//xla/python/ifrt_proxy/common:ifrt_proxy.bzl", "default_ifrt_proxy_visibility", "ifrt_proxy_cc_test") load("@local_tsl//tsl:tsl.bzl", "if_google") -load("@local_tsl//tsl:tsl.default.bzl", "tsl_pybind_extension") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -494,9 +493,17 @@ ifrt_proxy_cc_test( ], ) -tsl_pybind_extension( +cc_library( name = "py_module", srcs = ["py_module.cc"], + hdrs = ["py_module.h"], + compatible_with = [], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + ], + features = ["-use_header_modules"], + visibility = ["//xla/python:__pkg__"], deps = [ ":grpc_client", 
":registry", @@ -514,6 +521,5 @@ tsl_pybind_extension( "@local_tsl//tsl/platform:statusor", "@pybind11", "@pybind11_abseil//pybind11_abseil:absl_casters", - "@pybind11_protobuf//pybind11_protobuf:native_proto_caster", ], ) diff --git a/third_party/xla/xla/python/ifrt_proxy/client/py_module.cc b/third_party/xla/xla/python/ifrt_proxy/client/py_module.cc index 4b407bb438bb71..c20dc63c4d06d9 100644 --- a/third_party/xla/xla/python/ifrt_proxy/client/py_module.cc +++ b/third_party/xla/xla/python/ifrt_proxy/client/py_module.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include "xla/python/ifrt_proxy/client/py_module.h" #include #include @@ -32,7 +33,6 @@ #include "pybind11/pybind11.h" // from @pybind11 #include "pybind11/pytypes.h" // from @pybind11 #include "pybind11_abseil/absl_casters.h" // from @pybind11_abseil // NOLINT // IWYU pragma: keep -#include "pybind11_protobuf/native_proto_caster.h" // from @pybind11_protobuf #include "xla/pjrt/status_casters.h" #include "xla/python/ifrt/client.h" #include "xla/python/ifrt_proxy/client/registry.h" @@ -100,20 +100,22 @@ absl::StatusOr> GetClient( } } // namespace -} // namespace proxy -} // namespace ifrt -} // namespace xla -PYBIND11_MODULE(py_module, m) { - pybind11_protobuf::ImportNativeProtoCasters(); +void BuildIfrtProxySubmodule(pybind11::module_& m) { + pybind11::module_ sub_module = m.def_submodule("ifrt_proxy", "IFRT proxy"); - using ::xla::ifrt::proxy::PyClientConnectionOptions; - pybind11::class_(m, "ClientConnectionOptions") + pybind11::class_(sub_module, + "ClientConnectionOptions") .def(pybind11::init<>()) .def_readwrite("on_disconnect", &PyClientConnectionOptions::on_disconnect) .def_readwrite("on_connection_update", &PyClientConnectionOptions::on_connection_update); - m.def("get_client", xla::ValueOrThrowWrapper(xla::ifrt::proxy::GetClient), - pybind11::arg("proxy_server_address"), pybind11::arg("options")); + sub_module.def("get_client", xla::ValueOrThrowWrapper(GetClient), + pybind11::arg("proxy_server_address"), + pybind11::arg("options")); } + +} // namespace proxy +} // namespace ifrt +} // namespace xla diff --git a/third_party/xla/xla/python/ifrt_proxy/client/py_module.h b/third_party/xla/xla/python/ifrt_proxy/client/py_module.h new file mode 100644 index 00000000000000..508d91a0f2d7c5 --- /dev/null +++ b/third_party/xla/xla/python/ifrt_proxy/client/py_module.h @@ -0,0 +1,31 @@ +/* Copyright 2024 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef XLA_PYTHON_IFRT_PROXY_CLIENT_PY_MODULE_H_ +#define XLA_PYTHON_IFRT_PROXY_CLIENT_PY_MODULE_H_ + +#include "pybind11/pybind11.h" // from @pybind11 + +namespace xla { +namespace ifrt { +namespace proxy { + +void BuildIfrtProxySubmodule(pybind11::module_& m); + +} // namespace proxy +} // namespace ifrt +} // namespace xla + +#endif // XLA_PYTHON_IFRT_PROXY_CLIENT_PY_MODULE_H_ diff --git a/third_party/xla/xla/python/ifrt_proxy/jax/BUILD b/third_party/xla/xla/python/ifrt_proxy/jax/BUILD index b86f65e9c3596a..1a84033e2fa33a 100644 --- a/third_party/xla/xla/python/ifrt_proxy/jax/BUILD +++ b/third_party/xla/xla/python/ifrt_proxy/jax/BUILD @@ -31,7 +31,6 @@ pytype_strict_library( # copybara:uncomment_end deps = [ "//xla/python:xla_client", - "//xla/python/ifrt_proxy/client:py_module", "@pybind11_abseil//pybind11_abseil:status", ], ) diff --git a/third_party/xla/xla/python/ifrt_proxy/jax/ifrt_proxy_internal.py b/third_party/xla/xla/python/ifrt_proxy/jax/ifrt_proxy_internal.py index 746575cdd61135..790c9567e010af 100644 --- a/third_party/xla/xla/python/ifrt_proxy/jax/ifrt_proxy_internal.py +++ b/third_party/xla/xla/python/ifrt_proxy/jax/ifrt_proxy_internal.py @@ -12,14 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Library to help create a IFRT proxy client.""" +"""Library to help create a IFRT proxy client. + +This library is no longer recommended nor used in OSS; it is used internally +within google code. TODO(madthanu): Remove library. +""" import dataclasses from typing import Callable, Optional from pybind11_abseil import status from xla.python import xla_client -from xla.python.ifrt_proxy.client import py_module @dataclasses.dataclass @@ -47,6 +50,7 @@ def get_client(proxy_server_address: str) -> xla_client.Client: """Creates an IFRT Proxy client for the given server address.""" global _backend_created _backend_created = True + py_module = xla_client._xla.ifrt_proxy # pylint: disable=protected-access cpp_options = py_module.ClientConnectionOptions() cpp_options.on_disconnect = _connection_options.on_disconnect cpp_options.on_connection_update = _connection_options.on_connection_update diff --git a/third_party/xla/xla/python/xla.cc b/third_party/xla/xla/python/xla.cc index 05752d3ff715e1..12870ab43f93d4 100644 --- a/third_party/xla/xla/python/xla.cc +++ b/third_party/xla/xla/python/xla.cc @@ -65,6 +65,7 @@ limitations under the License. #include "xla/pjrt/distributed/protocol.pb.h" #include "xla/pjrt/distributed/service.h" #include "xla/pjrt/pjrt_compiler.h" +#include "xla/python/ifrt_proxy/client/py_module.h" #include "xla/python/py_client.h" #include "xla/service/cpu/collectives_interface.h" #include "tsl/python/lib/core/numpy.h" //NOLINT @@ -1016,6 +1017,10 @@ static void Init(py::module_& m) { BuildMlirSubmodule(m_nb); BuildCustomCallShardingPybindAPI(m_nb); + // The following uses python bindings for PyClient defined above using + // pybind11, and hence needs pybind11::module_ (not just nanobind::module_). 
+ xla::ifrt::proxy::BuildIfrtProxySubmodule(m); + py::class_> preemption_sync_manager(m, "PreemptionSyncManager"); diff --git a/third_party/xla/xla/python/xla_extension/__init__.pyi b/third_party/xla/xla/python/xla_extension/__init__.pyi index 294a62a9136cfb..8fe1300bd94c73 100644 --- a/third_party/xla/xla/python/xla_extension/__init__.pyi +++ b/third_party/xla/xla/python/xla_extension/__init__.pyi @@ -37,6 +37,7 @@ from typing import ( import numpy as np +from . import ifrt_proxy from . import jax_jit from . import mlir from . import ops diff --git a/third_party/xla/xla/python/xla_extension/ifrt_proxy.pyi b/third_party/xla/xla/python/xla_extension/ifrt_proxy.pyi new file mode 100644 index 00000000000000..f65685025e5166 --- /dev/null +++ b/third_party/xla/xla/python/xla_extension/ifrt_proxy.pyi @@ -0,0 +1,32 @@ +# Copyright 2024 The OpenXLA Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from typing import Any, Optional, Callable + +from xla.python import xla_extension + +_Status = Any +Client = xla_extension.Client + + +class ClientConnectionOptions: + on_disconnect: Optional[Callable[[_Status], None]] = None + on_connection_update: Optional[Callable[[str], None]] = None + + +def get_client( + proxy_server_address: str, + options: ClientConnectionOptions +) -> Client: ... From a26cc64f94d09388f4fe5f3a21c309b74fed5a03 Mon Sep 17 00:00:00 2001 From: Henning Becker Date: Mon, 18 Mar 2024 10:37:00 -0700 Subject: [PATCH 029/670] Rollback of GpuTimer: improve kernel execution time measurement accuracy This breaks the gemm algorithm picker test on V100 PiperOrigin-RevId: 616875670 --- .../xla/service/gpu/conv_algorithm_picker.cc | 12 +- .../xla/service/gpu/gemm_algorithm_picker.cc | 31 +---- .../xla/xla/stream_executor/build_defs.bzl | 8 -- third_party/xla/xla/stream_executor/gpu/BUILD | 42 +------ .../xla/xla/stream_executor/gpu/gpu_timer.cc | 114 ++---------------- .../xla/xla/stream_executor/gpu/gpu_timer.h | 34 +----- .../gpu/gpu_timer_kernel.cu.cc | 52 -------- .../stream_executor/gpu/gpu_timer_kernel.h | 26 ---- .../gpu/gpu_timer_kernel_stub.cc | 22 ---- 9 files changed, 21 insertions(+), 320 deletions(-) delete mode 100644 third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.cu.cc delete mode 100644 third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.h delete mode 100644 third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel_stub.cc diff --git a/third_party/xla/xla/service/gpu/conv_algorithm_picker.cc b/third_party/xla/xla/service/gpu/conv_algorithm_picker.cc index 4c21084f51d48b..54bde3ac33e147 100644 --- a/third_party/xla/xla/service/gpu/conv_algorithm_picker.cc +++ b/third_party/xla/xla/service/gpu/conv_algorithm_picker.cc @@ -612,6 +612,7 @@ absl::StatusOr GpuConvAlgorithmPicker::AutotuneOneConvRunner( // Use assignment instead of brace-list to make GCC 4.9 happy. 
RunConvOptions options; options.runner_cache = runner; + options.profile_result = &profile_result; // The following plan timing code is based on // https://github.com/NVIDIA/cudnn-frontend/blob/60496f42fdc7a4ccc059f5934e306e728a756755/include/cudnn_frontend_find_plan.h float max_time = 0; @@ -624,20 +625,15 @@ absl::StatusOr GpuConvAlgorithmPicker::AutotuneOneConvRunner( // Dry-run to warmup the plan. launch_status = RunGpuConv(config, operand_buffers, result_buffers, scratch_memory, stream, options); - // It is intentional that the warm-up run does not have a profile result. - // This avoids a timeout and error message if lazy module loading is enabled - // by ensuring that lazy loading happens outside the GpuTimer region. - options.profile_result = &profile_result; constexpr int kMaxIter = 10; // Iterate until the new measurement is within kThreshold of the current // minimum. int num_iters = 0; - for (; num_iters < kMaxIter && launch_status.ok(); ++num_iters) { + for (; + num_iters < kMaxIter && launch_status.ok() && profile_result.is_valid(); + num_iters++) { launch_status = RunGpuConv(config, operand_buffers, result_buffers, scratch_memory, stream, options); - if (!profile_result.is_valid()) { - break; - } float old_min_time = min_time; min_time = std::min(min_time, profile_result.elapsed_time_in_ms()); max_time = std::max(max_time, profile_result.elapsed_time_in_ms()); diff --git a/third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc b/third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc index a5ff6665031619..9ed90d2ba6eadd 100644 --- a/third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc +++ b/third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc @@ -25,7 +25,6 @@ limitations under the License. #include #include "absl/container/flat_hash_set.h" -#include "absl/log/log.h" #include "absl/status/status.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" @@ -241,15 +240,6 @@ class GemmAutotuner { auto tuned_func = [&](const se::blas::AlgorithmType& algorithm) -> absl::StatusOr { - // Do a warm-up run first, without a profile result. This avoids a timeout - // and error message if lazy module loading is enabled by ensuring that - // lazy loading happens outside the GpuTimer. RunGemm swallows error codes - // when profile_result is passed, as it is in the measurement below, but - // not otherwise. It is, therefore, consistent to ignore the error code - // here. - static_cast(RunGemm(gemm_config, lhs_buffer_, rhs_buffer_, - output_buffer_, workspace_buffer, - deterministic_ops_, stream_, algorithm)); se::blas::ProfileResult profile_result; // We expect GemmWithAlgorithm to fail sometimes -- in fact, it will fail // for all algorithms if we're targeting < sm_50. But because we pass a @@ -421,28 +411,15 @@ absl::StatusOr RunOnInstruction(HloInstruction* gemm, config.GetGpuComputeCapability()); if (update_algorithm) { - int64_t new_algorithm{}; if (algorithm.has_gemm()) { - new_algorithm = algorithm.gemm().algorithm(); + backend_config.set_selected_algorithm(algorithm.gemm().algorithm()); } else { // NOTE: runtime autotuning is no longer available => set to default - new_algorithm = se::blas::kDefaultAlgorithm; + backend_config.set_selected_algorithm(se::blas::kDefaultAlgorithm); } - - if (new_algorithm == old_algorithm && - backend_config.has_selected_algorithm()) { - // We don't need to update the backend config if - // the algorithm hasn't changed unless previously - // the algorithm wasn't set explicitly. 
- return false; - } - - backend_config.set_selected_algorithm(new_algorithm); - TF_RETURN_IF_ERROR(gemm->set_backend_config(gpu_config)); - return true; // We changed `gemm` } - - return false; // No change to `gemm` + TF_RETURN_IF_ERROR(gemm->set_backend_config(gpu_config)); + return old_algorithm != backend_config.selected_algorithm(); } absl::StatusOr RunOnComputation(HloComputation* computation, diff --git a/third_party/xla/xla/stream_executor/build_defs.bzl b/third_party/xla/xla/stream_executor/build_defs.bzl index 4e43fbec8d0c1e..6916574c646edf 100644 --- a/third_party/xla/xla/stream_executor/build_defs.bzl +++ b/third_party/xla/xla/stream_executor/build_defs.bzl @@ -88,11 +88,3 @@ def cuda_only_cc_library(name, tags = [], **kwargs): restricted_to = kwargs.get("restricted_to"), target_compatible_with = kwargs.get("target_compatible_with"), ) - -# TODO(hebecker): Remove this once we've fixed our ARM build -def if_google_arm_build( - if_true, # @unused - if_false = []): - return select({ - "//conditions:default": if_false, - }) diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD index 8dfb83e2657c98..f75e32f0fe8866 100644 --- a/third_party/xla/xla/stream_executor/gpu/BUILD +++ b/third_party/xla/xla/stream_executor/gpu/BUILD @@ -20,7 +20,6 @@ load( load( "//xla/stream_executor:build_defs.bzl", "gpu_only_cc_library", - "if_google_arm_build", "if_gpu_is_configured", ) load( @@ -316,47 +315,11 @@ gpu_only_cc_library( ], ) -gpu_only_cc_library( - name = "gpu_timer_kernel_header", - hdrs = ["gpu_timer_kernel.h"], -) - -gpu_kernel_library( - name = "gpu_timer_kernel", - srcs = if_gpu_is_configured(["gpu_timer_kernel.cu.cc"]), - deps = [ - ":gpu_timer_kernel_header", - ] + if_cuda_is_configured([ - "@local_config_cuda//cuda:cuda_headers", - ]) + if_rocm_is_configured([ - "@local_config_rocm//rocm:rocm_headers", - ]), -) - -# TODO(hebecker): Remove this once we have fixed our ARM build -gpu_only_cc_library( - name = "gpu_timer_kernel_stub", - srcs = [ - "gpu_timer_kernel_stub.cc", - ], - deps = [":gpu_timer_kernel_header"], -) - -# TODO(hebecker): Remove this once we have fixed our ARM build -cc_library( - name = "gpu_timer_kernel_not_on_google_arm", - deps = if_google_arm_build( - [":gpu_timer_kernel_stub"], - [":gpu_timer_kernel"], - ), -) - gpu_only_cc_library( name = "gpu_timer_header", hdrs = ["gpu_timer.h"], deps = [ ":gpu_executor_header", - ":gpu_timer_kernel_header", ":gpu_types_header", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/time", @@ -371,7 +334,6 @@ gpu_only_cc_library( ":gpu_driver_header", ":gpu_executor_header", ":gpu_stream", - ":gpu_timer_kernel_header", ":gpu_types_header", "//xla/stream_executor", "//xla/stream_executor:stream_executor_internal", @@ -386,9 +348,7 @@ gpu_only_cc_library( "@local_tsl//tsl/platform:errors", "@local_tsl//tsl/platform:status", "@local_tsl//tsl/platform:statusor", - ] + if_gpu_is_configured([ - ":gpu_timer_kernel_not_on_google_arm", - ]) + if_cuda_is_configured([ + ] + if_cuda_is_configured([ "//xla/stream_executor/cuda:cuda_driver", ]) + if_rocm_is_configured([ "//xla/stream_executor/rocm:rocm_driver", diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_timer.cc b/third_party/xla/xla/stream_executor/gpu/gpu_timer.cc index c0256e8051c719..ecd3f40c6725c9 100644 --- a/third_party/xla/xla/stream_executor/gpu/gpu_timer.cc +++ b/third_party/xla/xla/stream_executor/gpu/gpu_timer.cc @@ -51,21 +51,10 @@ absl::Duration RandomDuration() { return 
absl::Microseconds(distribution(rng)); } -bool ShouldLaunchDelayKernel() { - // Only launch the delay kernel if CUDA_LAUNCH_BLOCKING is not set to 1. - static bool value = [] { - const char* blocking = std::getenv("CUDA_LAUNCH_BLOCKING"); - return !blocking || std::string_view{blocking} != "1"; - }(); - return value; -} - } // namespace /*deprecated*/ /*static*/ absl::StatusOr GpuTimer::Create( GpuStream* stream) { - // This deprecated factory does not launch the delay kernel and may lead to - // reduced measurement accuracy. GpuExecutor* parent = stream->parent(); GpuContext* context = parent->gpu_context(); GpuEventHandle start_event; @@ -83,8 +72,6 @@ bool ShouldLaunchDelayKernel() { /*deprecated*/ /*static*/ absl::StatusOr> GpuTimer::CreateIfNeeded(GpuStream* stream, bool is_needed) { - // This deprecated factory does not launch the delay kernel and may lead to - // reduced measurement accuracy. if (is_needed) { TF_ASSIGN_OR_RETURN(GpuTimer t, GpuTimer::Create(stream)); return {std::make_optional(std::move(t))}; @@ -92,78 +79,16 @@ GpuTimer::CreateIfNeeded(GpuStream* stream, bool is_needed) { return std::nullopt; } -/*static*/ absl::StatusOr -GpuTimer::GpuSemaphore::Create(StreamExecutor* executor) { - // Allocate the value in pinned host memory that can be read from both - // host and device. - TF_ASSIGN_OR_RETURN(auto alloc, - executor->HostMemoryAllocate(sizeof(GpuSemaphoreState))); - return GpuSemaphore{std::move(alloc)}; +[[deprecated("So it can quietly call a deprecated method")]] /*static*/ absl:: + StatusOr + GpuTimer::Create(Stream* stream) { + return GpuTimer::Create(AsGpuStream(stream)); } -DeviceMemory GpuTimer::GpuSemaphore::device() { - // This assumes unified addressing, as we do not explicitly translate the - // host pointer into a device pointer. - return DeviceMemory::MakeFromByteSize( - ptr_->opaque(), sizeof(GpuSemaphoreState)); -} - -/*static*/ absl::StatusOr GpuTimer::Create(Stream* real_stream) { - StreamExecutor* executor = real_stream->parent(); - GpuStream* stream = AsGpuStream(real_stream); - GpuExecutor* parent = stream->parent(); - GpuContext* context = parent->gpu_context(); - GpuEventHandle start_event; - TF_RETURN_IF_ERROR(GpuDriver::InitEvent(context, &start_event, - GpuDriver::EventFlags::kDefault)); - GpuEventHandle stop_event; - TF_RETURN_IF_ERROR(GpuDriver::InitEvent(context, &stop_event, - GpuDriver::EventFlags::kDefault)); - CHECK(start_event != nullptr && stop_event != nullptr); - GpuSemaphore semaphore{}; - if (ShouldLaunchDelayKernel()) { - // Check the assumption that this device supports unified addressing, - // otherwise skip the delay kernel - TF_ASSIGN_OR_RETURN(int status, GpuDriver::GetDeviceAttribute( - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, - parent->device())); - if (!status) { - LOG(WARNING) << "Skipping the delay kernel because the device does not " - "support unified addressing"; - } else { - // Allocate a semaphore value that will be used to signal to the delay - // kernel that it may exit. - TF_ASSIGN_OR_RETURN(semaphore, GpuSemaphore::Create(executor)); - *semaphore = GpuSemaphoreState::Hold; - // In principle the kernel could be loaded lazily and shared across - // multiple GpuTimer objects. - TF_ASSIGN_OR_RETURN( - auto kernel, - (TypedKernel, - GpuSemaphoreState>::Create(executor, "DelayKernel", - delay_kernel::kernel()))); - // Launch a delay kernel into this stream, which will spin until - // GetElapsedDuration() is called, the timer is destroyed, or the timeout - // in the kernel is reached. 
- TF_RETURN_IF_ERROR(real_stream->ThenLaunch( - ThreadDim(1, 1, 1), BlockDim(1, 1, 1), kernel, semaphore.device(), - GpuSemaphoreState::Release)); - } - } - // The start event goes after the delay kernel in the stream - TF_RETURN_IF_ERROR(GpuDriver::RecordEvent(parent->gpu_context(), start_event, - stream->gpu_stream())); - return absl::StatusOr{absl::in_place, parent, start_event, - stop_event, stream, std::move(semaphore)}; -} - -/*static*/ absl::StatusOr> GpuTimer::CreateIfNeeded( - Stream* stream, bool is_needed) { - if (is_needed) { - TF_ASSIGN_OR_RETURN(GpuTimer t, GpuTimer::Create(stream)); - return {std::make_optional(std::move(t))}; - } - return std::nullopt; +[[deprecated("So it can quietly call a deprecated method")]] /*static*/ absl:: + StatusOr> + GpuTimer::CreateIfNeeded(Stream* stream, bool is_needed) { + return GpuTimer::CreateIfNeeded(AsGpuStream(stream), is_needed); } /*static*/ void GpuTimer::ReturnRandomDurationsForTesting() { @@ -172,17 +97,6 @@ DeviceMemory GpuTimer::GpuSemaphore::device() { GpuTimer::~GpuTimer() { GpuContext* context = parent_->gpu_context(); - if (semaphore_ && !is_stopped_) { - // Signal the delay kernel that it can exit - *semaphore_ = GpuSemaphoreState::Release; - // Wait for the delay kernel to exit before destroying the value that it is - // watching. - absl::Status status = - GpuDriver::SynchronizeStream(context, stream_->gpu_stream()); - if (!status.ok()) { - LOG(ERROR) << status; - } - } if (start_event_ != nullptr) { absl::Status status = GpuDriver::DestroyEvent(context, &start_event_); if (!status.ok()) { @@ -203,18 +117,6 @@ absl::StatusOr GpuTimer::GetElapsedDuration() { } TF_RETURN_IF_ERROR(GpuDriver::RecordEvent(parent_->gpu_context(), stop_event_, stream_->gpu_stream())); - // If we launched the delay kernel then check if it already timed out. - if (semaphore_) { - if (*semaphore_ == GpuSemaphoreState::TimedOut) { - // The delay kernel did not achieve the intended result. - LOG(ERROR) << "Delay kernel timed out: measured time has sub-optimal " - "accuracy. There may be a missing warmup execution, please " - "investigate in Nsight Systems."; - } else { - // Signal that the kernel can exit - *semaphore_ = GpuSemaphoreState::Release; - } - } float elapsed_milliseconds = NAN; if (!GpuDriver::GetEventElapsedTime(parent_->gpu_context(), &elapsed_milliseconds, start_event_, diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_timer.h b/third_party/xla/xla/stream_executor/gpu/gpu_timer.h index 251c77ec7ee1ea..8fd83bec6499e3 100644 --- a/third_party/xla/xla/stream_executor/gpu/gpu_timer.h +++ b/third_party/xla/xla/stream_executor/gpu/gpu_timer.h @@ -22,7 +22,6 @@ limitations under the License. #include "absl/status/statusor.h" #include "absl/time/time.h" #include "xla/stream_executor/gpu/gpu_executor.h" -#include "xla/stream_executor/gpu/gpu_timer_kernel.h" #include "xla/stream_executor/gpu/gpu_types.h" namespace xla { @@ -37,29 +36,9 @@ namespace gpu { class GpuExecutor; class GpuStream; -// When a timer is created it launches a delay kernel into the given stream and -// queues a start event immediately afterwards. This delay kernel blocks -// execution on the stream until GetElapsedDuration() is called, at which point -// an end event is queued and the delay kernel exits. This allows the device -// execution time of the tasks queued to the stream while the timer is active -// to be measured more accurately. +// Timer is started once it's created, and is stopped once read. 
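+// Reading it with GetElapsedDuration() records the stop event and returns the
+// time elapsed between the start and stop events.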
class GpuTimer { public: - class GpuSemaphore { - public: - GpuSemaphore() = default; - static absl::StatusOr Create(StreamExecutor* executor); - explicit operator bool() const { return bool{ptr_}; } - GpuSemaphoreState& operator*() { - return *static_cast(ptr_->opaque()); - } - DeviceMemory device(); - - private: - explicit GpuSemaphore(std::unique_ptr alloc) - : ptr_{std::move(alloc)} {} - std::unique_ptr ptr_; - }; static absl::StatusOr Create(Stream* stream); [[deprecated("Pass Stream* not GpuStream*")]] static absl::StatusOr Create(GpuStream* stream); @@ -74,20 +53,17 @@ class GpuTimer { CreateIfNeeded(GpuStream* stream, bool is_needed); explicit GpuTimer(GpuExecutor* parent, GpuEventHandle start_event, - GpuEventHandle stop_event, GpuStream* stream, - GpuSemaphore semaphore = {}) + GpuEventHandle stop_event, GpuStream* stream) : parent_(parent), start_event_(start_event), stop_event_(stop_event), - stream_(stream), - semaphore_(std::move(semaphore)) {} + stream_(stream) {} GpuTimer(GpuTimer&& other) : parent_(other.parent_), start_event_(std::exchange(other.start_event_, nullptr)), stop_event_(std::exchange(other.stop_event_, nullptr)), - stream_(other.stream_), - semaphore_(std::move(other.semaphore_)) {} + stream_(other.stream_) {} GpuTimer& operator=(GpuTimer&& other) { if (this != &other) { @@ -95,7 +71,6 @@ class GpuTimer { start_event_ = std::exchange(other.start_event_, nullptr); stop_event_ = std::exchange(other.stop_event_, nullptr); stream_ = other.stream_; - semaphore_ = std::move(other.semaphore_); } return *this; } @@ -111,7 +86,6 @@ class GpuTimer { GpuEventHandle start_event_ = nullptr; GpuEventHandle stop_event_ = nullptr; GpuStream* stream_; - GpuSemaphore semaphore_; bool is_stopped_ = false; GpuTimer(const GpuTimer&) = delete; diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.cu.cc b/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.cu.cc deleted file mode 100644 index 0ce4b1d9fbb323..00000000000000 --- a/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.cu.cc +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright 2024 The OpenXLA Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "xla/stream_executor/gpu/gpu_timer_kernel.h" - -#include - -namespace stream_executor::gpu { -namespace { -// Wait for the value pointed to by `semaphore` to have value `target`, timing -// out after approximately `APPROX_TIMEOUT_SECONDS` seconds if that value is -// not reached. This can happen if, for example, blocking launches are enabled -// via CUDA_LAUNCH_BLOCKING=1. It can also happen if launching a kernel after -// this delay kernel causes synchronisation, e.g. because of lazy loading. 
-__global__ void DelayKernel(volatile GpuSemaphoreState* semaphore, - GpuSemaphoreState target) { - constexpr int64_t WAIT_CYCLES{1024}; - constexpr int64_t TIMEOUT_CYCLES{200000000}; // 100ms at 2GHz - const int64_t tstart{clock64()}; - bool target_not_reached; - while ((target_not_reached = (*semaphore != target)) && - (clock64() - tstart) < TIMEOUT_CYCLES) { - int64_t elapsed{}; - const int64_t t0{clock64()}; - do { - elapsed = clock64() - t0; - } while (elapsed < WAIT_CYCLES); - } - if (target_not_reached) { - // We are exiting due to the timeout. Signal this back to the host so that - // we can emit a warning, as it probably indicates suboptimal usage. - *semaphore = GpuSemaphoreState::TimedOut; - } -} -} // namespace - -namespace delay_kernel { -void* kernel() { return reinterpret_cast(DelayKernel); } -} // namespace delay_kernel - -} // namespace stream_executor::gpu diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.h b/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.h deleted file mode 100644 index 2ac358b4ee56c5..00000000000000 --- a/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.h +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright 2024 The OpenXLA Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef XLA_STREAM_EXECUTOR_GPU_GPU_TIMER_KERNEL_H_ -#define XLA_STREAM_EXECUTOR_GPU_GPU_TIMER_KERNEL_H_ - -namespace stream_executor::gpu { -enum struct GpuSemaphoreState { Hold, Release, TimedOut }; -namespace delay_kernel { -void* kernel(); // returns a pointer to a CUDA C++ device function -} // namespace delay_kernel -} // namespace stream_executor::gpu - -#endif // XLA_STREAM_EXECUTOR_GPU_GPU_TIMER_KERNEL_H_ diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel_stub.cc b/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel_stub.cc deleted file mode 100644 index 5286b5445b8b56..00000000000000 --- a/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel_stub.cc +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright 2024 The OpenXLA Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "xla/stream_executor/gpu/gpu_timer_kernel.h" - -namespace stream_executor::gpu { -namespace delay_kernel { -void* kernel() { return nullptr; } -} // namespace delay_kernel -} // namespace stream_executor::gpu From e37cdcfa8b76ec3ea8fbeb8748b472b77603e42b Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Mon, 18 Mar 2024 11:10:33 -0700 Subject: [PATCH 030/670] Cherry-pick https://github.com/llvm/llvm-project/commit/daa350c1995015daac552548c34b87220f21156d into TF This lets MSVC compile MLIR again PiperOrigin-RevId: 616887711 --- ...50c1995015daac552548c34b87220f21156d.patch | 77 +++++++++++++++++++ third_party/llvm/workspace.bzl | 1 + 2 files changed, 78 insertions(+) create mode 100644 third_party/llvm/daa350c1995015daac552548c34b87220f21156d.patch diff --git a/third_party/llvm/daa350c1995015daac552548c34b87220f21156d.patch b/third_party/llvm/daa350c1995015daac552548c34b87220f21156d.patch new file mode 100644 index 00000000000000..541c4e2f3bbbcb --- /dev/null +++ b/third_party/llvm/daa350c1995015daac552548c34b87220f21156d.patch @@ -0,0 +1,77 @@ +commit daa350c1995015daac552548c34b87220f21156d +Author: Benjamin Kramer +Date: Sun Mar 17 14:05:41 2024 +0100 + + [mlir] Work around MSVC bug + + MSVC fails to parse this construct, leading to + MlirTranslateMain.cpp(70): error C2065: 'inputSplitMarker': undeclared identifier + + Just switching to brace init works around the issue + +diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp +index 51504ad58282..44c5e9826f3b 100644 +--- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp ++++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp +@@ -128,7 +128,7 @@ struct MlirOptMainConfigCLOptions : public MlirOptMainConfig { + cl::desc("Print the list of registered dialects and exit"), + cl::location(showDialectsFlag), cl::init(false)); + +- static cl::opt splitInputFile( ++ static cl::opt splitInputFile{ + "split-input-file", llvm::cl::ValueOptional, + cl::callback([&](const std::string &str) { + // Implicit value: use default marker if flag was used without value. +@@ -137,7 +137,7 @@ struct MlirOptMainConfigCLOptions : public MlirOptMainConfig { + }), + cl::desc("Split the input file into chunks using the given or " + "default marker and process each chunk independently"), +- cl::location(splitInputFileFlag), cl::init("")); ++ cl::location(splitInputFileFlag), cl::init("")}; + + static cl::opt outputSplitMarker( + "output-split-marker", +diff --git a/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp b/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp +index 1aaf8adb50a7..bd9928950ecc 100644 +--- a/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp ++++ b/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp +@@ -62,7 +62,7 @@ LogicalResult mlir::mlirTranslateMain(int argc, char **argv, + llvm::cl::desc("Allow operation with no registered dialects (discouraged: testing only!)"), + llvm::cl::init(false)); + +- static llvm::cl::opt inputSplitMarker( ++ static llvm::cl::opt inputSplitMarker{ + "split-input-file", llvm::cl::ValueOptional, + llvm::cl::callback([&](const std::string &str) { + // Implicit value: use default marker if flag was used without value. 
+@@ -71,7 +71,7 @@ LogicalResult mlir::mlirTranslateMain(int argc, char **argv, + }), + llvm::cl::desc("Split the input file into chunks using the given or " + "default marker and process each chunk independently"), +- llvm::cl::init("")); ++ llvm::cl::init("")}; + + static llvm::cl::opt verifyDiagnostics( + "verify-diagnostics", +diff --git a/mlir/tools/mlir-pdll/mlir-pdll.cpp b/mlir/tools/mlir-pdll/mlir-pdll.cpp +index d312765e40b0..c6ad6c361e99 100644 +--- a/mlir/tools/mlir-pdll/mlir-pdll.cpp ++++ b/mlir/tools/mlir-pdll/mlir-pdll.cpp +@@ -136,7 +136,7 @@ int main(int argc, char **argv) { + llvm::cl::desc( + "Print out the parsed ODS information from the input file"), + llvm::cl::init(false)); +- llvm::cl::opt inputSplitMarker( ++ llvm::cl::opt inputSplitMarker{ + "split-input-file", llvm::cl::ValueOptional, + llvm::cl::callback([&](const std::string &str) { + // Implicit value: use default marker if flag was used without value. +@@ -145,7 +145,7 @@ int main(int argc, char **argv) { + }), + llvm::cl::desc("Split the input file into chunks using the given or " + "default marker and process each chunk independently"), +- llvm::cl::init("")); ++ llvm::cl::init("")}; + llvm::cl::opt outputSplitMarker( + "output-split-marker", + llvm::cl::desc("Split marker to use for merging the ouput"), diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl index c190989fc46286..67616ae9c97943 100644 --- a/third_party/llvm/workspace.bzl +++ b/third_party/llvm/workspace.bzl @@ -17,6 +17,7 @@ def repo(name): ], build_file = "//third_party/llvm:llvm.BUILD", patch_file = [ + "//third_party/llvm:daa350c1995015daac552548c34b87220f21156d.patch", "//third_party/llvm:generated.patch", # Autogenerated, don't remove. "//third_party/llvm:build.patch", "//third_party/llvm:mathextras.patch", From fa310c6b644e8859c59fdd01cf6dbb1d85496f8c Mon Sep 17 00:00:00 2001 From: Dragan Mladjenovic Date: Mon, 18 Mar 2024 11:19:07 -0700 Subject: [PATCH 031/670] PR #9873: [ROCm] Don't use CUDA PTX for ROCM in ComputationIdCmd Imported from GitHub PR https://github.com/openxla/xla/pull/9873 Copybara import of the project: -- 818077159230e06ce8e94b3c556d1d68fa125b09 by Dragan Mladjenovic : [ROCm] Don't use CUDA PTX for ROCM in ComputationIdCmd Merging this change closes #9873 PiperOrigin-RevId: 616890560 --- .../xla/xla/service/gpu/runtime/command_buffer_cmd.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd.cc b/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd.cc index c1909dc98e437a..1da3c5d4253c9f 100644 --- a/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd.cc +++ b/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd.cc @@ -502,6 +502,7 @@ CommandBufferCmd::BufferUsageVector ComputationIdCmd::buffers() { absl::Status ComputationIdCmd::Initialize(const Thunk::InitializeParams& params, StateManager& state) { +#if defined(GOOGLE_CUDA) { absl::MutexLock lock(&mutex_); if (memset_kernels_.contains(params.executor)) return absl::OkStatus(); @@ -514,6 +515,7 @@ absl::Status ComputationIdCmd::Initialize(const Thunk::InitializeParams& params, absl::MutexLock lock(&mutex_); memset_kernels_.emplace(params.executor, std::move(kernel)); +#endif // GOOGLE_CUDA return absl::OkStatus(); } @@ -540,6 +542,7 @@ absl::Status ComputationIdCmd::Record( << "; execution_scope_id=" << execution_scope_id.value(); VLOG(5) << " Id: " << dest_ << " (" << dst.opaque() << ")"; +#if defined(GOOGLE_CUDA) se::Kernel* memset_kernel = [&] { 
absl::MutexLock lock(&mutex_); return memset_kernels_[execute_params.stream->parent()].get(); @@ -553,6 +556,10 @@ absl::Status ComputationIdCmd::Record( auto args = se::PackKernelArgs(/*shmem_bytes=*/0, int64_t{1}, value, dst); return command_buffer->Launch(execution_scope_id, se::ThreadDim(1), se::BlockDim(1), *memset_kernel, *args); +#else + return command_buffer->Memset(execution_scope_id, &dst, value, + /*num_elements=*/1); +#endif // GOOGLE_CUDA } //===----------------------------------------------------------------------===// From 3f979c5b9e78106530bb425d895251d560cf444a Mon Sep 17 00:00:00 2001 From: Fergus Henderson Date: Mon, 18 Mar 2024 11:42:46 -0700 Subject: [PATCH 032/670] Fix build breakage for //tensorflow/lite/delegates/flex:util_test. PiperOrigin-RevId: 616898445 --- tensorflow/lite/delegates/flex/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index 5f126f68124cf8..da620e2e011019 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -342,6 +342,7 @@ tf_cc_test( srcs = ["util_test.cc"], deps = [ ":util", + "//tensorflow/c:tf_datatype", "//tensorflow/core:framework", "//tensorflow/core/protobuf:error_codes_proto_impl_cc", "//tensorflow/lite:string", From d817b4582e48e66b947882c5bfe407ae763902ef Mon Sep 17 00:00:00 2001 From: Sergei Lebedev Date: Mon, 18 Mar 2024 13:01:18 -0700 Subject: [PATCH 033/670] [xla::ffi] Forked Pointer to xla/ffi/api/ffi.h It is useful in both "internal" and "external" FFI versions. PiperOrigin-RevId: 616921396 --- third_party/xla/xla/ffi/api/ffi.h | 25 ++++++++++++++++++++++++ third_party/xla/xla/ffi/api/ffi_test.cc | 26 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/third_party/xla/xla/ffi/api/ffi.h b/third_party/xla/xla/ffi/api/ffi.h index f6d742656a2a79..6c281e4878f960 100644 --- a/third_party/xla/xla/ffi/api/ffi.h +++ b/third_party/xla/xla/ffi/api/ffi.h @@ -243,6 +243,31 @@ struct ArgDecoding> { } }; +//===----------------------------------------------------------------------===// +// Attributes decoding +//===----------------------------------------------------------------------===// + +// A type tag to mark i64 attributes as pointers to `T`. +template +struct Pointer {}; + +template +struct AttrDecoding> { + using Type = T*; + + static std::optional Decode(XLA_FFI_AttrType type, void* attr, + DiagnosticEngine& diagnostic) { + if (type != XLA_FFI_AttrType_I64) { + return diagnostic.Emit("Wrong attribute type: ") + << "expected i64 for passing user data but got " << type; + } + + static_assert(sizeof(uintptr_t) == sizeof(int64_t)); + uintptr_t ptr = *reinterpret_cast(attr); + return reinterpret_cast(ptr); + } +}; + //===----------------------------------------------------------------------===// // Result encoding //===----------------------------------------------------------------------===// diff --git a/third_party/xla/xla/ffi/api/ffi_test.cc b/third_party/xla/xla/ffi/api/ffi_test.cc index af720f2686370c..53ef32daa7283c 100644 --- a/third_party/xla/xla/ffi/api/ffi_test.cc +++ b/third_party/xla/xla/ffi/api/ffi_test.cc @@ -231,6 +231,32 @@ TEST(FfiTest, BindingPlatformStreamInference) { (void)Ffi::BindTo(+[](TestStream stream) { return Error::Success(); }); } +TEST(FfiTest, PointerAttr) { + std::string foo = "foo"; + + // Test for convenience attr binding that casts i64 attribute to user-type + // pointers. It's up to the user to guarantee that pointer is valid. 
+ auto ptr = reinterpret_cast(&foo); + static_assert(sizeof(ptr) == sizeof(int64_t)); + + CallFrameBuilder::AttributesBuilder attrs; + attrs.Insert("ptr", static_cast(ptr)); + + CallFrameBuilder builder; + builder.AddAttributes(attrs.Build()); + auto call_frame = builder.Build(); + + auto fn = [&](const std::string* str) { + EXPECT_EQ(*str, "foo"); + return Error::Success(); + }; + + auto handler = Ffi::Bind().Attr>("ptr").To(fn); + auto status = Call(*handler, call_frame); + + TF_ASSERT_OK(status); +} + //===----------------------------------------------------------------------===// // Performance benchmarks are below. //===----------------------------------------------------------------------===// From 013c4759ca4b40cb1281fb19473c1d5a3f6d18cd Mon Sep 17 00:00:00 2001 From: Ionel Gog Date: Mon, 18 Mar 2024 13:01:32 -0700 Subject: [PATCH 034/670] Fix conversions between ShardingParam and HloSharding. This change ensures that conversions work correctly for meshes with more than 2 axis, and adds additional tests for conversions between HloSharding to ShardingParam, ShardingParam to OpSharding, and ShardingParam to HloSharding. PiperOrigin-RevId: 616921476 --- third_party/xla/xla/python/ifrt/ir/BUILD | 3 + .../xla/xla/python/ifrt/ir/sharding_param.cc | 77 +++++++--- .../xla/xla/python/ifrt/ir/sharding_param.h | 3 + .../python/ifrt/ir/tests/verify_array.mlir | 2 +- .../python/ifrt/ir/tests/verify_reshard.mlir | 4 +- third_party/xla/xla/python/ifrt/support/BUILD | 1 + .../ifrt/support/sharding_conversions.cc | 27 +++- .../ifrt/support/sharding_conversions_test.cc | 145 +++++++++++++++--- 8 files changed, 210 insertions(+), 52 deletions(-) diff --git a/third_party/xla/xla/python/ifrt/ir/BUILD b/third_party/xla/xla/python/ifrt/ir/BUILD index e81f592e48baff..5f99830ce7703f 100644 --- a/third_party/xla/xla/python/ifrt/ir/BUILD +++ b/third_party/xla/xla/python/ifrt/ir/BUILD @@ -134,10 +134,13 @@ cc_library( ":ifrt_dialect_inc_gen", ":ifrt_interfaces_inc_gen", ":ifrt_ops_inc_gen", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:errors", ], ) diff --git a/third_party/xla/xla/python/ifrt/ir/sharding_param.cc b/third_party/xla/xla/python/ifrt/ir/sharding_param.cc index 68c9e472901385..d8b36fb5d72d87 100644 --- a/third_party/xla/xla/python/ifrt/ir/sharding_param.cc +++ b/third_party/xla/xla/python/ifrt/ir/sharding_param.cc @@ -18,14 +18,20 @@ limitations under the License. #include #include +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/raw_ostream.h" #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tsl/platform/errors.h" namespace xla { namespace ifrt { @@ -66,14 +72,37 @@ void PopulateDevices(llvm::ArrayRef permutation, } // namespace +absl::Status ShardingParam::MinorToMajor::verify() const { + if (permutation.size() != axis_sizes.size() || axis_sizes.empty()) { + return absl::InvalidArgumentError(absl::StrCat( + "Expect same non-zero size for `permutation` and `axis_sizes`. 
Actual ", + permutation.size(), " vs ", axis_sizes.size())); + } + llvm::DenseSet permutation_set(permutation.begin(), permutation.end()); + if (permutation_set.size() != permutation.size()) { + return absl::InvalidArgumentError( + absl::StrCat("`permutation` [", absl::StrJoin(permutation, ","), + "] has duplicate values")); + } + for (const int index : permutation) { + if (index < 0 || index >= axis_sizes.size()) { + return absl::InvalidArgumentError( + absl::StrCat("Out of range axis ", index, " to the mesh of [", + absl::StrJoin(permutation, ","), "] on ", + absl::StrJoin(axis_sizes, "x"))); + } + } + return absl::OkStatus(); +} + mlir::LogicalResult ShardingParam::MinorToMajor::verify( llvm::function_ref emit_error) const { - if (permutation.size() != axis_sizes.size() || axis_sizes.empty()) { - return emit_error() << "Expect same non-zero size for `permutation` and " - "`axis_sizes`. Actual " - << permutation.size() << " vs " << axis_sizes.size(); + auto status = verify(); + if (status.ok()) { + return mlir::success(); + } else { + return emit_error() << status.message(); } - return mlir::success(); } void ShardingParam::MinorToMajor::ToDeviceList( @@ -120,12 +149,8 @@ mlir::FailureOr ShardingParam::Parse( return ShardingParam(dim_shards, minor_to_major); } -mlir::LogicalResult ShardingParam::verify( - llvm::function_ref emit_error) const { - if (mlir::failed(minor_to_major().verify(emit_error))) { - return mlir::failure(); - } - +absl::Status ShardingParam::verify() const { + TF_RETURN_IF_ERROR(minor_to_major().verify()); int dim_index = 0; int cum_size = 1; for (const int index : minor_to_major().permutation) { @@ -135,17 +160,11 @@ mlir::LogicalResult ShardingParam::verify( if (dim_index == dim_shards().size()) { break; } - if (index < 0 || index >= minor_to_major().axis_sizes.size()) { - return emit_error() << "Out of range axis " << index << " to the mesh of " - << minor_to_major().permutation << " on " - << minor_to_major().axis_sizes; - } - cum_size *= minor_to_major().axis_sizes[index]; if (cum_size > dim_shards()[dim_index]) { - return emit_error() << "Dimension #" << dim_index << " of " - << dim_shards()[dim_index] - << " shards can't be assigned to the axes"; + return absl::InvalidArgumentError(absl::StrCat( + "Dimension #", dim_index, " of ", dim_shards()[dim_index], + " shards can't be assigned to the axes")); } else if (cum_size == dim_shards()[dim_index]) { cum_size = 1; dim_index++; @@ -155,12 +174,22 @@ mlir::LogicalResult ShardingParam::verify( dim_index++; } if (dim_index != dim_shards().size()) { - return emit_error() << "Can't shard the dims " << dim_shards() - << " to the mesh of " << minor_to_major().permutation - << " on " << minor_to_major().axis_sizes; + return absl::InvalidArgumentError(absl::StrCat( + "Can't shard the dims ", absl::StrJoin(dim_shards(), "x"), + " to the mesh of [", absl::StrJoin(minor_to_major().permutation, ","), + "] on ", absl::StrJoin(minor_to_major().axis_sizes, "x"))); } + return absl::OkStatus(); +} - return mlir::success(); +mlir::LogicalResult ShardingParam::verify( + llvm::function_ref emit_error) const { + auto status = verify(); + if (status.ok()) { + return mlir::success(); + } else { + return emit_error() << status.message(); + } } std::string ShardingParam::DebugString() const { diff --git a/third_party/xla/xla/python/ifrt/ir/sharding_param.h b/third_party/xla/xla/python/ifrt/ir/sharding_param.h index 5388f860b53ee7..13de6a96e9dcb5 100644 --- a/third_party/xla/xla/python/ifrt/ir/sharding_param.h +++ 
b/third_party/xla/xla/python/ifrt/ir/sharding_param.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "absl/status/status.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLFunctionalExtras.h" @@ -79,6 +80,7 @@ class ShardingParam { // The size of mesh dimensions before the permutation. llvm::SmallVector axis_sizes; + absl::Status verify() const; mlir::LogicalResult verify( llvm::function_ref emit_error) const; @@ -94,6 +96,7 @@ class ShardingParam { : dim_shards_(dim_shards), minor_to_major_(minor_to_major) {} static mlir::FailureOr Parse(mlir::AsmParser& ods_parser); + absl::Status verify() const; mlir::LogicalResult verify( llvm::function_ref emit_error) const; diff --git a/third_party/xla/xla/python/ifrt/ir/tests/verify_array.mlir b/third_party/xla/xla/python/ifrt/ir/tests/verify_array.mlir index 339d351958d3e2..81b557bf28d5e9 100644 --- a/third_party/xla/xla/python/ifrt/ir/tests/verify_array.mlir +++ b/third_party/xla/xla/python/ifrt/ir/tests/verify_array.mlir @@ -64,7 +64,7 @@ func.func @array_requires_same_permutation_and_axis_sizes() { // ----- func.func @array_requires_enough_devices() { - // expected-error@+2 {{Can't shard the dims 2, 2 to the mesh of 0 on 2}} + // expected-error@+2 {{Can't shard the dims 2x2 to the mesh of [0] on 2}} %0 = builtin.unrealized_conversion_cast to !ifrt.array, 2x2 to [0] on 2, [0,1]> return diff --git a/third_party/xla/xla/python/ifrt/ir/tests/verify_reshard.mlir b/third_party/xla/xla/python/ifrt/ir/tests/verify_reshard.mlir index cc8370e81f9ad1..a34af467efe6a7 100644 --- a/third_party/xla/xla/python/ifrt/ir/tests/verify_reshard.mlir +++ b/third_party/xla/xla/python/ifrt/ir/tests/verify_reshard.mlir @@ -47,7 +47,7 @@ func.func @reshard_requires_same_global_shape( func.func @reshard_requires_non_negative_axis_index( %arg0: !ifrt.array, 1x1 to [0] on 2, [0,1]>) attributes {ifrt.function} { - // expected-error@+3 {{Out of range axis -1 to the mesh of -1 on 2}} + // expected-error@+3 {{Out of range axis -1 to the mesh of [-1] on 2}} %0 = ifrt.Reshard(%arg0) : (!ifrt.array, 1x1 to [0] on 2, [0,1]>) -> !ifrt.array, 1x2 to [-1] on 2, [2,3]> @@ -59,7 +59,7 @@ func.func @reshard_requires_non_negative_axis_index( func.func @reshard_requires_valid_axis_index( %arg0: !ifrt.array, 1x1 to [0] on 2, [0,1]>) attributes {ifrt.function} { - // expected-error@+3 {{Out of range axis 1234567890 to the mesh of 1234567890 on 2}} + // expected-error@+3 {{Out of range axis 1234567890 to the mesh of [1234567890] on 2}} %0 = ifrt.Reshard(%arg0) : (!ifrt.array, 1x1 to [0] on 2, [0,1]>) -> !ifrt.array, 1x2 to [1234567890] on 2, [2,3]> diff --git a/third_party/xla/xla/python/ifrt/support/BUILD b/third_party/xla/xla/python/ifrt/support/BUILD index f0405ba8ca8783..33907fd34f3d13 100644 --- a/third_party/xla/xla/python/ifrt/support/BUILD +++ b/third_party/xla/xla/python/ifrt/support/BUILD @@ -40,6 +40,7 @@ xla_cc_test( "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", "@com_google_googletest//:gtest_main", + "@local_tsl//tsl/lib/core:status_test_util", "@local_tsl//tsl/platform:status_matchers", "@local_tsl//tsl/platform:statusor", ], diff --git a/third_party/xla/xla/python/ifrt/support/sharding_conversions.cc b/third_party/xla/xla/python/ifrt/support/sharding_conversions.cc index fc315091c8c18e..1c6a2b3f6f5a73 100644 --- a/third_party/xla/xla/python/ifrt/support/sharding_conversions.cc +++ b/third_party/xla/xla/python/ifrt/support/sharding_conversions.cc @@ -106,14 +106,21 @@ absl::StatusOr 
ToHloSharding(const ShardingParam& sharding_param) { cum_size *= dim_shard; dims.push_back(dim_shard); } + // Applies the inverse of the transposes from `ToShardingParam`. + llvm::SmallVector permutation; + int num_axis = sharding_param.minor_to_major().permutation.size(); + permutation.reserve(num_axis); + for (const int axis_id : + llvm::reverse(sharding_param.minor_to_major().permutation)) { + permutation.push_back(num_axis - axis_id - 1); + } if (device_count != cum_size) { // Add the replicated dimension. dims.push_back(device_count / cum_size); - return HloSharding::PartialTile(TileAssignment( - dims, reshape_dims, sharding_param.minor_to_major().permutation)); + return HloSharding::PartialTile( + TileAssignment(dims, reshape_dims, permutation)); } else { - return HloSharding::IotaTile(dims, reshape_dims, - sharding_param.minor_to_major().permutation); + return HloSharding::IotaTile(dims, reshape_dims, permutation); } } @@ -175,8 +182,16 @@ absl::StatusOr ToShardingParam(const HloSharding& hlo_sharding, llvm::reverse(tile_assignment.iota()->reshape_dims())) { minor_to_major.axis_sizes.push_back(reshape_dim); } - for (int axis_id : tile_assignment.iota()->transpose_perm()) { - minor_to_major.permutation.push_back(axis_id); + // The devices generated by HloSharding + // np.arange(ndevices).reshape(reshape_dims).transpose(transpose_perm) + // must be equal to the devices ShardingParam + // np.arange(ndevices).reshape(reverse(axis_size)).T.transpose(perm).T + // Step 1: Compute transpose(transpose_perm).T. + // Step 2: Compute T.transpose(transpose_perm).T. + int num_axis = tile_assignment.iota()->transpose_perm().size(); + for (int axis_id : + llvm::reverse(tile_assignment.iota()->transpose_perm())) { + minor_to_major.permutation.push_back(num_axis - axis_id - 1); } } return ShardingParam(dim_shards, std::move(minor_to_major)); diff --git a/third_party/xla/xla/python/ifrt/support/sharding_conversions_test.cc b/third_party/xla/xla/python/ifrt/support/sharding_conversions_test.cc index 4f9cdd2f6ffe7b..22b213ff7c2d7d 100644 --- a/third_party/xla/xla/python/ifrt/support/sharding_conversions_test.cc +++ b/third_party/xla/xla/python/ifrt/support/sharding_conversions_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include "xla/python/ifrt/support/sharding_conversions.h" #include +#include #include #include @@ -35,6 +36,7 @@ limitations under the License. 
#include "xla/python/ifrt/sharding_test_util.h" #include "xla/shape.h" #include "xla/xla_data.pb.h" +#include "tsl/lib/core/status_test_util.h" #include "tsl/platform/status_matchers.h" #include "tsl/platform/statusor.h" @@ -44,22 +46,24 @@ namespace support { namespace { using ::tsl::testing::StatusIs; +using xla::HloSharding; -absl::StatusOr ToHloShardingViaOpSharding( +absl::StatusOr ToHloShardingViaOpSharding( const ShardingParam& sharding_param, absl::Span device_list) { TF_ASSIGN_OR_RETURN(xla::OpSharding op_sharding, ToOpSharding(sharding_param, device_list)); - return xla::HloSharding::FromProto(op_sharding); + return HloSharding::FromProto(op_sharding); } TEST(ShardingConversionsTest, Replicated) { ShardingParam expected_sharding_param{ /*dim_shards=*/{1, 1, 1}, {/*permutation=*/{0, 1}, /*axis_sizes=*/{2, 3}}}; - TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding hlo_iota_sharding, + TF_EXPECT_OK(expected_sharding_param.verify()); + TF_ASSERT_OK_AND_ASSIGN(const HloSharding hlo_iota_sharding, ToHloSharding(expected_sharding_param)); TF_ASSERT_OK_AND_ASSIGN( - const xla::HloSharding hlo_sharding, + const HloSharding hlo_sharding, ToHloShardingViaOpSharding(expected_sharding_param, {0, 1, 2, 3, 4, 5})); EXPECT_EQ(hlo_sharding.ToString(), "{replicated}"); EXPECT_EQ(hlo_sharding, hlo_iota_sharding); @@ -67,7 +71,7 @@ TEST(ShardingConversionsTest, Replicated) { ToShardingParam(hlo_iota_sharding, 3, 6)); // We do not compare expected_sharding_param and sharding_param because they // haven't been canonicalized (1x1x1 to [0, 1] on 2x3 vs. 1x1x1 to [0] on 6). - TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding actual_hlo_sharding, + TF_ASSERT_OK_AND_ASSIGN(const HloSharding actual_hlo_sharding, ToHloSharding(sharding_param)); EXPECT_EQ(hlo_iota_sharding, actual_hlo_sharding); } @@ -75,10 +79,11 @@ TEST(ShardingConversionsTest, Replicated) { TEST(ShardingConversionsTest, SingleDeviceReplicated) { ShardingParam expected_sharding_param{ /*dim_shards=*/{1, 1}, {/*permutation=*/{0}, /*axis_sizes=*/{1}}}; - TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding hlo_iota_sharding, + TF_EXPECT_OK(expected_sharding_param.verify()); + TF_ASSERT_OK_AND_ASSIGN(const HloSharding hlo_iota_sharding, ToHloSharding(expected_sharding_param)); TF_ASSERT_OK_AND_ASSIGN( - const xla::HloSharding hlo_sharding, + const HloSharding hlo_sharding, ToHloShardingViaOpSharding(expected_sharding_param, {0})); EXPECT_EQ(hlo_sharding.ToString(), "{replicated}"); EXPECT_EQ(hlo_sharding, hlo_iota_sharding); @@ -91,10 +96,11 @@ TEST(ShardingConversionsTest, Permutation) { ShardingParam expected_sharding_param{ /*dim_shards=*/{2, 1, 3}, {/*permutation=*/{1, 0}, /*axis_sizes=*/{3, 2}}}; - TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding hlo_iota_sharding, + TF_EXPECT_OK(expected_sharding_param.verify()); + TF_ASSERT_OK_AND_ASSIGN(const HloSharding hlo_iota_sharding, ToHloSharding(expected_sharding_param)); TF_ASSERT_OK_AND_ASSIGN( - const xla::HloSharding hlo_sharding, + const HloSharding hlo_sharding, ToHloShardingViaOpSharding(expected_sharding_param, {0, 1, 2, 3, 4, 5})); EXPECT_EQ(hlo_sharding.ToString(), "{devices=[2,1,3]0,3,1,4,2,5}"); EXPECT_EQ(hlo_sharding, hlo_iota_sharding); @@ -106,10 +112,11 @@ TEST(ShardingConversionsTest, Permutation) { TEST(ShardingConversionsTest, Partial) { ShardingParam expected_sharding_param{ /*dim_shards=*/{2, 1}, {/*permutation=*/{0, 1}, /*axis_sizes=*/{2, 3}}}; - TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding hlo_iota_sharding, + TF_EXPECT_OK(expected_sharding_param.verify()); + 
TF_ASSERT_OK_AND_ASSIGN(const HloSharding hlo_iota_sharding, ToHloSharding(expected_sharding_param)); TF_ASSERT_OK_AND_ASSIGN( - const xla::HloSharding hlo_sharding, + const HloSharding hlo_sharding, ToHloShardingViaOpSharding(expected_sharding_param, {0, 1, 2, 3, 4, 5})); EXPECT_EQ(hlo_sharding.ToString(), "{devices=[2,1,3]0,1,2,3,4,5 last_tile_dim_replicate}"); @@ -118,7 +125,7 @@ TEST(ShardingConversionsTest, Partial) { ToShardingParam(hlo_iota_sharding, 2, 6)); // We do not compare expected_sharding_param and sharding_param because they // haven't been canonicalized (2x1 to [0, 1] on 2x3 vs. 2x1 to [0] on 6). - TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding actual_hlo_sharding, + TF_ASSERT_OK_AND_ASSIGN(const HloSharding actual_hlo_sharding, ToHloSharding(sharding_param)); EXPECT_EQ(hlo_iota_sharding, actual_hlo_sharding); } @@ -126,10 +133,11 @@ TEST(ShardingConversionsTest, Partial) { TEST(ShardingConversionsTest, OneDimToTwoAxes) { ShardingParam expected_sharding_param{ /*dim_shards=*/{4}, {/*permutation=*/{1, 0}, /*axis_sizes=*/{2, 2}}}; - TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding hlo_iota_sharding, + TF_EXPECT_OK(expected_sharding_param.verify()); + TF_ASSERT_OK_AND_ASSIGN(const HloSharding hlo_iota_sharding, ToHloSharding(expected_sharding_param)); TF_ASSERT_OK_AND_ASSIGN( - const xla::HloSharding hlo_sharding, + const HloSharding hlo_sharding, ToHloShardingViaOpSharding(expected_sharding_param, {0, 1, 2, 3})); EXPECT_EQ(hlo_sharding.ToString(), "{devices=[4]0,2,1,3}"); EXPECT_EQ(hlo_sharding, hlo_iota_sharding); @@ -142,21 +150,116 @@ TEST(ShardingConversionsTest, NonTrivialDeviceAssignment) { ShardingParam expected_sharding_param{ /*dim_shards=*/{2, 1, 3}, {/*permutation=*/{1, 0}, /*axis_sizes=*/{3, 2}}}; + TF_EXPECT_OK(expected_sharding_param.verify()); TF_ASSERT_OK_AND_ASSIGN( - const xla::HloSharding hlo_sharding, + const HloSharding hlo_sharding, ToHloShardingViaOpSharding(expected_sharding_param, {6, 5, 4, 3, 2, 1})); EXPECT_EQ(hlo_sharding.ToString(), "{devices=[2,1,3]6,3,5,2,4,1}"); } +TEST(ShardingConversionsTest, VerifyIncorrectShardings) { + ShardingParam different_permutation_and_axis{ + /*dim_shards=*/{1, 1}, {/*permutation=*/{0, 1}, /*axis_sizes=*/{2}}}; + EXPECT_FALSE(different_permutation_and_axis.verify().ok()); + ShardingParam too_many_slices{/*dim_shards=*/{2, 2}, + {/*permutation=*/{0}, /*axis_sizes=*/{2}}}; + EXPECT_FALSE(too_many_slices.verify().ok()); + ShardingParam cannot_distribute_slices{ + /*dim_shards=*/{1, 2}, {/*permutation=*/{0, 1}, /*axis_sizes=*/{3, 2}}}; + EXPECT_FALSE(cannot_distribute_slices.verify().ok()); + ShardingParam incorrect_permutation{ + /*dim_shards=*/{4, 1}, + {/*permutation=*/{0, 1, 1}, /*axis_sizes=*/{2, 2, 2}}}; + EXPECT_FALSE(incorrect_permutation.verify().ok()); +} + TEST(ShardingConversionsTest, ErrorOnDeviceAssignment) { ShardingParam sharding_param{/*dim_shards=*/{2, 1, 3}, {/*permutation=*/{1, 0}, /*axis_sizes=*/{3, 2}}}; + TF_EXPECT_OK(sharding_param.verify()); EXPECT_THAT( ToHloShardingViaOpSharding(sharding_param, {6, 5, 4, 3, 2}), StatusIs(absl::StatusCode::kOutOfRange, ::testing::HasSubstr("Can't map device with logical id 5"))); } +struct HloShardingTestStruct { + HloSharding hlo_sharding; + int rank; + int num_devices; +}; + +using HloShardingToShardingParamTest = + ::testing::TestWithParam; + +TEST_P(HloShardingToShardingParamTest, HloShardingToShardingParam) { + const auto& param = GetParam(); + TF_ASSERT_OK_AND_ASSIGN( + auto sharding_param, + ToShardingParam(param.hlo_sharding, param.rank, 
param.num_devices)); + // We cannot verify sharding param because we're losing info about the + // axis_size during these conversions. While strictly some ShardingParam + // are invalid because they have more dims than axis, in practice this is not + // a problem because we can still correctly map the shards to the devices. + TF_ASSERT_OK_AND_ASSIGN(auto actual_hlo_sharding, + ToHloSharding(sharding_param)); + EXPECT_EQ(param.hlo_sharding, actual_hlo_sharding); + // Verify that the conversion to OpSharding is also correct. + std::vector device_ids(param.num_devices); + std::iota(device_ids.begin(), device_ids.end(), 0); + TF_ASSERT_OK_AND_ASSIGN( + auto hlo_via_op_sharding, + ToHloShardingViaOpSharding(sharding_param, device_ids)); + EXPECT_EQ(param.hlo_sharding, hlo_via_op_sharding); +} + +INSTANTIATE_TEST_SUITE_P( + HloShardingConversionTests, HloShardingToShardingParamTest, + testing::ValuesIn({ + {HloSharding::IotaTile({4, 2}), 2, 8}, + {HloSharding::IotaTile({2, 4}, {4, 2}, {1, 0}), 2, 8}, + {HloSharding::IotaTile({8, 1}), 2, 8}, + {HloSharding::IotaTile({8, 1}, {4, 2}, {1, 0}), 2, 8}, + {HloSharding::PartialTile(TileAssignment({4, 1, 2}, {8}, {0})), 2, 8}, + {HloSharding::PartialTile(TileAssignment({2, 1, 4}, {4, 2}, {1, 0})), 2, + 8}, + {HloSharding::PartialTile(TileAssignment({1, 4, 2}, {8}, {0})), 2, 8}, + {HloSharding::PartialTile(TileAssignment({1, 2, 4}, {4, 2}, {1, 0})), 2, + 8}, + {HloSharding::PartialTile(TileAssignment({4, 3, 2}, {2, 3, 4}, + {2, 1, 0})), + 2, 24}, + {HloSharding::PartialTile(TileAssignment({4, 2, 3}, {6, 4}, {1, 0})), 2, + 24}, + {HloSharding::PartialTile(TileAssignment({6, 1, 4}, {24}, {0})), 2, 24}, + {HloSharding::PartialTile(TileAssignment({12, 1, 2}, {2, 12}, {1, 0})), + 2, 24}, + {HloSharding::PartialTile(TileAssignment({8, 1, 3}, {6, 4}, {1, 0})), 2, + 24}, + {HloSharding::PartialTile(TileAssignment({2, 1, 12}, {24}, {0})), 2, + 24}, + {HloSharding::PartialTile(TileAssignment({3, 1, 8}, {2, 3, 4}, + {1, 0, 2})), + 2, 24}, + {HloSharding::PartialTile(TileAssignment({1, 4, 6}, {6, 4}, {1, 0})), 2, + 24}, + {HloSharding::PartialTile(TileAssignment({1, 12, 2}, {2, 12}, {1, 0})), + 2, 24}, + + {HloSharding::PartialTile(TileAssignment({3, 2, 1, 4}, {2, 3, 4}, + {1, 0, 2})), + 3, 24}, + {HloSharding::PartialTile(TileAssignment({2, 4, 1, 3}, {2, 3, 4}, + {0, 2, 1})), + 3, 24}, + {HloSharding::PartialTile(TileAssignment({4, 3, 1, 2}, {2, 3, 4}, + {2, 1, 0})), + 3, 24}, + {HloSharding::PartialTile(TileAssignment({12, 1, 1, 2}, {2, 12}, + {1, 0})), + 3, 24}, + })); + class ShardingConversionsEquivalentTest : public test_util::ShardingTest { public: void AssertSameTiling(const ShardingParam& sharding_param, @@ -187,8 +290,9 @@ class ShardingConversionsEquivalentTest : public test_util::ShardingTest { TEST_P(ShardingConversionsEquivalentTest, ShardingParamFullySharded) { ShardingParam sharding_param{/*dim_shards=*/{2, 3}, {/*permutation=*/{0, 1}, /*axis_sizes=*/{2, 3}}}; + TF_EXPECT_OK(sharding_param.verify()); TF_ASSERT_OK_AND_ASSIGN( - const xla::HloSharding hlo_sharding, + const HloSharding hlo_sharding, ToHloShardingViaOpSharding(sharding_param, {0, 1, 2, 3, 4, 5})); AssertSameTiling(sharding_param, hlo_sharding, Shape({6, 6})); } @@ -196,8 +300,9 @@ TEST_P(ShardingConversionsEquivalentTest, ShardingParamFullySharded) { TEST_P(ShardingConversionsEquivalentTest, ShardingParamWithPermutation) { ShardingParam sharding_param{/*dim_shards=*/{2, 3}, {/*permutation=*/{1, 0}, /*axis_sizes=*/{3, 2}}}; + TF_EXPECT_OK(sharding_param.verify()); TF_ASSERT_OK_AND_ASSIGN( 
- const xla::HloSharding hlo_sharding, + const HloSharding hlo_sharding, ToHloShardingViaOpSharding(sharding_param, {0, 1, 2, 3, 4, 5})); AssertSameTiling(sharding_param, hlo_sharding, Shape({6, 6})); } @@ -205,8 +310,9 @@ TEST_P(ShardingConversionsEquivalentTest, ShardingParamWithPermutation) { TEST_P(ShardingConversionsEquivalentTest, ShardingParamWithReplication) { ShardingParam sharding_param{/*dim_shards=*/{2, 1}, {/*permutation=*/{0, 1}, /*axis_sizes=*/{2, 3}}}; + TF_EXPECT_OK(sharding_param.verify()); TF_ASSERT_OK_AND_ASSIGN( - const xla::HloSharding hlo_sharding, + const HloSharding hlo_sharding, ToHloShardingViaOpSharding(sharding_param, {0, 1, 2, 3, 4, 5})); AssertSameTiling(sharding_param, hlo_sharding, Shape({6, 6})); } @@ -215,10 +321,11 @@ TEST_P(ShardingConversionsEquivalentTest, OpShardingReplicated) { OpSharding op_sharding; op_sharding.set_type(OpSharding::REPLICATED); TF_ASSERT_OK_AND_ASSIGN(auto hlo_sharding, - xla::HloSharding::FromProto(op_sharding)); + HloSharding::FromProto(op_sharding)); TF_ASSERT_OK_AND_ASSIGN(auto actual, ToShardingParam(hlo_sharding, 2, 6)); ShardingParam expected{/*dim_shards=*/{1, 1}, {/*permutation=*/{0}, /*axis_sizes=*/{6}}}; + TF_EXPECT_OK(expected.verify()); EXPECT_EQ(actual, expected); } From 531dc955df01e8d7b259f336f1a276e92a407871 Mon Sep 17 00:00:00 2001 From: Kyle Lucke Date: Mon, 18 Mar 2024 13:35:32 -0700 Subject: [PATCH 035/670] Make mock_nccl_utils.cc compile. PiperOrigin-RevId: 616932122 --- third_party/xla/xla/service/gpu/mock_nccl_utils.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/third_party/xla/xla/service/gpu/mock_nccl_utils.cc b/third_party/xla/xla/service/gpu/mock_nccl_utils.cc index bca0bc01bcef3f..56782e9e6d777a 100644 --- a/third_party/xla/xla/service/gpu/mock_nccl_utils.cc +++ b/third_party/xla/xla/service/gpu/mock_nccl_utils.cc @@ -53,7 +53,6 @@ limitations under the License. #include "third_party/gpus/nccl/include/info.h" #include "third_party/gpus/nccl/include/nccl_common.h" #include "third_party/nccl/nccl.h" -#include "third_party/gpus/nccl/src/include/device.h" #include "xla/debug_options_flags.h" #include "xla/executable_run_options.h" #include "xla/primitive_util.h" From ad6df1bba30e0f580006ddef18c2f8e1d81e9412 Mon Sep 17 00:00:00 2001 From: Kyle Lucke Date: Mon, 18 Mar 2024 13:37:12 -0700 Subject: [PATCH 036/670] Deduplicate BuildAttributesMap code. PiperOrigin-RevId: 616932606 --- .../xla/xla/service/gpu/fusions/custom.cc | 52 ------------------- .../xla/service/gpu/ir_emitter_unnested.cc | 51 ------------------ third_party/xla/xla/service/gpu/runtime/BUILD | 3 +- .../service/gpu/runtime/custom_call_thunk.cc | 51 ++++++++++++++++++ .../service/gpu/runtime/custom_call_thunk.h | 5 ++ 5 files changed, 58 insertions(+), 104 deletions(-) diff --git a/third_party/xla/xla/service/gpu/fusions/custom.cc b/third_party/xla/xla/service/gpu/fusions/custom.cc index fa910bc2589cf6..8027bd69756a3d 100644 --- a/third_party/xla/xla/service/gpu/fusions/custom.cc +++ b/third_party/xla/xla/service/gpu/fusions/custom.cc @@ -79,58 +79,6 @@ absl::StatusOr> BuildCustomKernelThunkForFusion( &fusion, std::move(custom_kernel), std::move(kernel_arguments.args())); } -// TODO(vuson): this is duplicated from ir_emitter_unnested.cc -// Converts MLIR dictionary attribute attached to a custom call operation to a -// custom call thunk attributes that are forwarded to the FFI handler. 
-static absl::StatusOr BuildAttributesMap( - mlir::DictionaryAttr dict) { - CustomCallThunk::AttributesMap attributes; - for (auto& kv : dict) { - std::string_view name = kv.getName().strref(); - - auto integer = [&](mlir::IntegerAttr integer) { - switch (integer.getType().getIntOrFloatBitWidth()) { - case 32: - attributes[name] = static_cast(integer.getInt()); - return absl::OkStatus(); - case 64: - attributes[name] = static_cast(integer.getInt()); - return absl::OkStatus(); - default: - return absl::InvalidArgumentError(absl::StrCat( - "Unsupported integer attribute bit width for attribute: ", name)); - } - }; - - auto fp = [&](mlir::FloatAttr fp) { - switch (fp.getType().getIntOrFloatBitWidth()) { - case 32: - attributes[name] = static_cast(fp.getValue().convertToFloat()); - return absl::OkStatus(); - default: - return absl::InvalidArgumentError(absl::StrCat( - "Unsupported float attribute bit width for attribute: ", name)); - } - }; - - auto str = [&](mlir::StringAttr str) { - attributes[name] = str.getValue().str(); - return absl::OkStatus(); - }; - - TF_RETURN_IF_ERROR( - llvm::TypeSwitch(kv.getValue()) - .Case(integer) - .Case(fp) - .Case(str) - .Default([&](mlir::Attribute) { - return absl::InvalidArgumentError(absl::StrCat( - "Unsupported attribute type for attribute: ", name)); - })); - } - return attributes; -} - absl::StatusOr GetSliceWithUpdatedOffsetAndSize( const BufferAssignment& buffer_assignment, const HloFusionAdaptor& fusion, const HloInstruction& fusion_instr, const HloInstruction& start, diff --git a/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc b/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc index 1826b5723b27f0..79eca88e8f96ea 100644 --- a/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc +++ b/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc @@ -1303,57 +1303,6 @@ absl::Status IrEmitterUnnested::EmitCholeskyThunk(const HloInstruction* instr) { } #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM -// Converts MLIR dictionary attribute attached to a custom call operation to a -// custom call thunk attributes that are forwarded to the FFI handler. 
-static absl::StatusOr BuildAttributesMap( - mlir::DictionaryAttr dict) { - CustomCallThunk::AttributesMap attributes; - for (auto& kv : dict) { - std::string_view name = kv.getName().strref(); - - auto integer = [&](mlir::IntegerAttr integer) { - switch (integer.getType().getIntOrFloatBitWidth()) { - case 32: - attributes[name] = static_cast(integer.getInt()); - return absl::OkStatus(); - case 64: - attributes[name] = static_cast(integer.getInt()); - return absl::OkStatus(); - default: - return absl::InvalidArgumentError(absl::StrCat( - "Unsupported integer attribute bit width for attribute: ", name)); - } - }; - - auto fp = [&](mlir::FloatAttr fp) { - switch (fp.getType().getIntOrFloatBitWidth()) { - case 32: - attributes[name] = static_cast(fp.getValue().convertToFloat()); - return absl::OkStatus(); - default: - return absl::InvalidArgumentError(absl::StrCat( - "Unsupported float attribute bit width for attribute: ", name)); - } - }; - - auto str = [&](mlir::StringAttr str) { - attributes[name] = str.getValue().str(); - return absl::OkStatus(); - }; - - TF_RETURN_IF_ERROR( - llvm::TypeSwitch(kv.getValue()) - .Case(integer) - .Case(fp) - .Case(str) - .Default([&](mlir::Attribute) { - return absl::InvalidArgumentError(absl::StrCat( - "Unsupported attribute type for attribute: ", name)); - })); - } - return attributes; -} - absl::Status IrEmitterUnnested::EmitCustomCallThunk( const HloCustomCallInstruction* instr) { const std::string call_target_name = instr->custom_call_target(); diff --git a/third_party/xla/xla/service/gpu/runtime/BUILD b/third_party/xla/xla/service/gpu/runtime/BUILD index e1ce6bd1e4c04a..df151764e4da79 100644 --- a/third_party/xla/xla/service/gpu/runtime/BUILD +++ b/third_party/xla/xla/service/gpu/runtime/BUILD @@ -470,10 +470,11 @@ cc_library( "//xla/stream_executor:device_memory", "//xla/stream_executor/gpu:gpu_stream_header", "//xla/stream_executor/gpu:gpu_types_header", - "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/status", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/strings:string_view", + "@llvm-project//llvm:Support", + "@local_tsl//tsl/platform:errors", ], ) diff --git a/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.cc b/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.cc index 28a7dcebfc1dfa..0edf3b7c9dced4 100644 --- a/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.cc +++ b/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.cc @@ -22,6 +22,7 @@ limitations under the License. #include "absl/status/status.h" #include "absl/strings/str_format.h" +#include "llvm/ADT/TypeSwitch.h" #include "xla/executable_run_options.h" #include "xla/ffi/api/c_api.h" #include "xla/ffi/call_frame.h" @@ -35,6 +36,7 @@ limitations under the License. #include "xla/status.h" #include "xla/stream_executor/device_memory.h" #include "xla/util.h" +#include "tsl/platform/errors.h" #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "xla/stream_executor/gpu/gpu_stream.h" @@ -149,5 +151,54 @@ absl::Status CustomCallThunk::ExecuteOnStream(const ExecuteParams& params) { return handler_ ? 
ExecuteFfiHandler(params) : ExecuteCustomCall(params); } +absl::StatusOr BuildAttributesMap( + mlir::DictionaryAttr dict) { + CustomCallThunk::AttributesMap attributes; + for (auto& kv : dict) { + std::string_view name = kv.getName().strref(); + + auto integer = [&](mlir::IntegerAttr integer) { + switch (integer.getType().getIntOrFloatBitWidth()) { + case 32: + attributes[name] = static_cast(integer.getInt()); + return absl::OkStatus(); + case 64: + attributes[name] = static_cast(integer.getInt()); + return absl::OkStatus(); + default: + return absl::InvalidArgumentError(absl::StrCat( + "Unsupported integer attribute bit width for attribute: ", name)); + } + }; + + auto fp = [&](mlir::FloatAttr fp) { + switch (fp.getType().getIntOrFloatBitWidth()) { + case 32: + attributes[name] = static_cast(fp.getValue().convertToFloat()); + return absl::OkStatus(); + default: + return absl::InvalidArgumentError(absl::StrCat( + "Unsupported float attribute bit width for attribute: ", name)); + } + }; + + auto str = [&](mlir::StringAttr str) { + attributes[name] = str.getValue().str(); + return absl::OkStatus(); + }; + + TF_RETURN_IF_ERROR( + llvm::TypeSwitch(kv.getValue()) + .Case(integer) + .Case(fp) + .Case(str) + .Default([&](mlir::Attribute) { + return absl::InvalidArgumentError(absl::StrCat( + "Unsupported attribute type for attribute: ", name)); + })); + } + return attributes; +} + } // namespace gpu } // namespace xla diff --git a/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.h b/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.h index dd445a248935e0..12d62c67c9af09 100644 --- a/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.h +++ b/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.h @@ -120,6 +120,11 @@ class CustomCallThunk : public Thunk { const HloComputation* called_computation_ = nullptr; }; +// Converts MLIR dictionary attribute attached to a custom call operation to a +// custom call thunk attributes that are forwarded to the FFI handler. +absl::StatusOr BuildAttributesMap( + mlir::DictionaryAttr dict); + } // namespace gpu } // namespace xla From b960be62b93ba5f14a7073c0d84d79a11d615fde Mon Sep 17 00:00:00 2001 From: Kyle Lucke Date: Mon, 18 Mar 2024 13:37:15 -0700 Subject: [PATCH 037/670] Call DropAllControlDeps before removing an instruction from the computation during RematerializeInstructions, or RemoveInstruction will fail. 
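HloComputation::RemoveInstruction RET_CHECKs IsSafelyRemovable(), and that check
returns false while an instruction still has control predecessors or successors
attached. Since RematerializeInstructions has already copied the control edges
onto the rematerialized clone, the old instruction's edges can simply be dropped
before removal. Roughly (sketch of the required ordering; `best` is the
instruction being replaced by its remat clone):

    TF_RETURN_IF_ERROR(best->DropAllControlDeps());
    TF_RETURN_IF_ERROR(computation->RemoveInstruction(best));
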
Before this change, hlo-opt was failing with this stack: INTERNAL: RET_CHECK failure (third_party/tensorflow/compiler/xla/hlo/ir/hlo_computation.cc:426) ignore_safety_check || IsSafelyRemovable(instruction) cannot remove instruction: %all-reduce-start.285 = (s32[8,64]{1,0}, s32[], s32[], s32[], s32[], /*index=5*/s32[], s32[], s32[], s32[], s32[], /*index=10*/s32[], s32[], s32[], s32[], s32[], /*index=15*/s32[], s32[], s32[], s32[], s32[]) all-reduce-start(s32[8,64]{1,0} %input_scatter_fusion.16, s32[] %convert.81230.0.remat, s32[] %copy.13646, s32[] %copy.13647, s32[] %copy.13648, /*index=5*/s32[] %copy.13649, s32[] %copy.13650, s32[] %copy.13651, s32[] %copy.13652, s32[] %copy.13653, /*index=10*/s32[] %copy.13654, s32[] %copy.13655, s32[] %copy.13656, s32[] %copy.13657, s32[] %copy.13658, /*index=15*/s32[] %copy.13659, s32[] %copy.13660, s32[] %copy.13661, s32[] %copy.13662, s32[] %copy.13663), channel_id=711, replica_groups={{0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124},{1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,65,69,73,77,81,85,89,93,97,101,105,109,113,117,121,125},{2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,66,70,74,78,82,86,90,94,98,102,106,110,114,118,122,126},{3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,67,71,75,79,83,87,91,95,99,103,107,111,115,119,123,127}}, use_global_device_ids=true, to_apply=%region_597.10276.clone.1, control-predecessors={%copy.14013, %copy.14014, %copy.14015, %copy.14016, %copy.14017, %copy.14018, %copy.14019, %copy.14020, %copy.14021, %copy.14022, %copy.14023, %copy.14024, %copy.14025, %copy.14026, %copy.14027, %copy.14028, %copy.14029, %copy.14030, %copy.14031, %copy.14032, %copy.14033, %copy.14034, %copy.14035, %copy.14036, %copy.14037, %copy.14038, %copy.14039, %copy.14040, %copy.14041, %copy.14042}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"collective_backend_config":{"is_sync":false,"no_parallel_custom_call":false}} === Source Location Trace: === third_party/tensorflow/compiler/xla/status_macros.cc:80 third_party/tensorflow/compiler/xla/service/hlo_rematerialization.cc:2165 third_party/tensorflow/compiler/xla/service/hlo_rematerialization.cc:2564 third_party/tensorflow/compiler/xla/service/hlo_rematerialization.cc:2708 third_party/tensorflow/compiler/xla/service/hlo_rematerialization.cc:2767 third_party/tensorflow/compiler/xla/service/hlo_rematerialization.cc:2905 third_party/tensorflow/compiler/xla/service/hlo_pass_pipeline.h:140 third_party/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc:185 third_party/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc:2183 third_party/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc:1923 third_party/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc:2012 third_party/tensorflow/compiler/xla/tools/hlo_opt/opt_lib.cc:155 third_party/tensorflow/compiler/xla/tools/hlo_opt/opt_lib.cc:108 third_party/tensorflow/compiler/xla/tools/hlo_opt/gpu_opt.cc:80 third_party/tensorflow/compiler/xla/tools/hlo_opt/opt_main.cc:166 third_party/tensorflow/compiler/xla/tools/hlo_opt/opt_main.cc:177 After this change, hlo-opt fails with an expected failure due to no JAX: UNIMPLEMENTED: No registered implementation for custom call to cu_threefry2x32 for platform CUDA === Source Location Trace: === third_party/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc:1391 third_party/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc:3023 third_party/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc:2574 
third_party/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc:2015 third_party/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc:3023 third_party/tensorflow/compiler/xla/service/gpu/compile_module_to_llvm_ir.cc:205 third_party/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc:1949 third_party/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc:2028 third_party/tensorflow/compiler/xla/tools/hlo_opt/opt_lib.cc:155 third_party/tensorflow/compiler/xla/tools/hlo_opt/opt_lib.cc:108 third_party/tensorflow/compiler/xla/tools/hlo_opt/gpu_opt.cc:80 third_party/tensorflow/compiler/xla/tools/hlo_opt/opt_main.cc:166 third_party/tensorflow/compiler/xla/tools/hlo_opt/opt_main.cc:177 PiperOrigin-RevId: 616932625 --- third_party/xla/xla/service/hlo_rematerialization.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/third_party/xla/xla/service/hlo_rematerialization.cc b/third_party/xla/xla/service/hlo_rematerialization.cc index 3197bb094b5f76..a524e3cca559dc 100644 --- a/third_party/xla/xla/service/hlo_rematerialization.cc +++ b/third_party/xla/xla/service/hlo_rematerialization.cc @@ -2162,6 +2162,10 @@ absl::StatusOr RematerializeInstructions( VLOG(2) << "The old instruction " << best->name() << " is an async op. Removing to maintain one start to one done " "invariant to keep the HLO valid."; + // We need to remove all control dependencies from best before removing it + // from the computation. Its control dependencies were previously copied + // to the remat instruction. + TF_RETURN_IF_ERROR(best->DropAllControlDeps()); TF_RETURN_IF_ERROR(computation->RemoveInstruction(best)); } } From 30488e7e5785733ec638fb1839ca75173ac97e5f Mon Sep 17 00:00:00 2001 From: "Jae H. Yoo" Date: Mon, 18 Mar 2024 13:48:08 -0700 Subject: [PATCH 038/670] Add flatbuffer export/import for bfloat16. 
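The exporter maps the MLIR bf16 element type to the BFLOAT16 flatbuffer tensor
type, the importer materializes bf16 constant buffers from their raw 16-bit
words, tfl.cast accepts bf16 operands, and the CAST builtin is bumped to
version 7 when either side is bfloat16. A minimal sketch of the constant path
only, assuming Eigen's bfloat16 helpers (the export direction shown here is an
illustration, not code from this patch):

    uint16_t bit_repr = 0x3F80;  // raw bits from the flatbuffer buffer (1.0 in bf16)
    Eigen::bfloat16 value = Eigen::numext::bit_cast<Eigen::bfloat16>(bit_repr);
    uint16_t back = Eigen::numext::bit_cast<uint16_t>(value);  // what gets serialized
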
PiperOrigin-RevId: 616935733 --- .../compiler/mlir/lite/flatbuffer_export.cc | 2 + tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 4 +- .../lite/tests/flatbuffer2mlir/cast_bf16.mlir | 12 +++ .../compiler/mlir/lite/tests/legalize-tf.mlir | 12 +++ .../lite/tests/mlir2flatbuffer/cast_bf16.mlir | 74 +++++++++++++++++++ .../mlir/lite/utils/const_tensor_utils.cc | 43 ++++++++--- .../lite/tools/versioning/op_version.cc | 7 +- 7 files changed, 138 insertions(+), 16 deletions(-) create mode 100644 tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/cast_bf16.mlir create mode 100644 tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/cast_bf16.mlir diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index 1a9ff8016649ef..dd28efd44eab14 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -181,6 +181,8 @@ static StatusOr GetTFLiteType(Type type, return tflite::TensorType_FLOAT32; } else if (type.isF16()) { return tflite::TensorType_FLOAT16; + } else if (type.isBF16()) { + return tflite::TensorType_BFLOAT16; } else if (type.isF64()) { return tflite::TensorType_FLOAT64; } else if (type.isa()) { diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 55388c86dfc7bf..481f5573058b8c 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -3926,10 +3926,10 @@ def TFL_CastOp : TFL_Op<"cast", [ }]; let arguments = (ins - TFL_TensorOf<[F16, F32, F64, I1, TFL_I4, I16, UI16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex>]>:$input + TFL_TensorOf<[F16, BF16, F32, F64, I1, TFL_I4, I16, UI16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex>]>:$input ); - let results = (outs TFL_TensorOf<[F16, F32, F64, I1, I16, UI16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex>]>:$output); + let results = (outs TFL_TensorOf<[F16, BF16, F32, F64, I1, I16, UI16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex>]>:$output); // TFLite's cast op does not utilize CastOptions, instead derives types // from the TfLiteTensors. 
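For context on the comment above: with no CastOptions, the runtime kernel picks
the conversion purely from the tensor types, along the lines of this sketch
(GetInput/GetOutput are TFLite's kernel_util helpers; the surrounding
Prepare/Eval plumbing is omitted and assumed):

    const TfLiteTensor* input = GetInput(context, node, 0);
    TfLiteTensor* output = GetOutput(context, node, 0);
    // e.g. input->type == kTfLiteBFloat16 with output->type == kTfLiteFloat32
    // selects the bf16 -> f32 path; no operator options are consulted.
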
diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/cast_bf16.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/cast_bf16.mlir new file mode 100644 index 00000000000000..56068d605016e7 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/cast_bf16.mlir @@ -0,0 +1,12 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s +// Ensure cast with bfloat16 roundtrip exactly + +func.func @main(tensor<4x5xbf16>) -> tensor<4x5xbf16> { +^bb0(%arg0: tensor<4x5xbf16>): + // CHECK-LABEL: @main + // CHECK: (tensor<4x5xbf16>) -> tensor<4x5xf32> + // CHECK-NEXT: (tensor<4x5xf32>) -> tensor<4x5xbf16> + %0 = "tfl.cast" (%arg0) : (tensor<4x5xbf16>) -> tensor<4x5xf32> loc("cast1") + %1 = "tfl.cast" (%0) : (tensor<4x5xf32>) -> tensor<4x5xbf16> loc("cast2") + func.return %1 : tensor<4x5xbf16> +} diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 685efd5be0ca2d..a0b9f90a879507 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -1875,6 +1875,18 @@ func.func @matmul_batchv3_unknown_dim(%arg0: tensor, %arg1: tensor< // CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor, tensor<15x17xf32>) -> tensor } +func.func @matmul_batchv3_unknown_dim_bf16(%arg0: tensor, %arg1: tensor<5x6xf32>) -> tensor { + %0 = "tf.Cast"(%arg0) : (tensor) -> tensor + %1 = "tf.BatchMatMulV3"(%0, %arg1) {Ta = "tfdtype$DT_FLOAT", Tb = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", adj_x = false, adj_y = false} : +(tensor, tensor<5x6xf32>) -> tensor + %2 = "tf.Cast"(%1) : (tensor) -> tensor + func.return %2 : tensor +// CHECK-LABEL: matmul_batchv3_unknown_dim_bf16 +// CHECK: [[CST:%.*]] = "tfl.cast"(%arg0) : (tensor) -> tensor +// CHECK: [[BMM:%.*]] = "tfl.batch_matmul"([[CST]], %arg1) {adj_x = false, adj_y = false} : (tensor, tensor<5x6xf32>) -> tensor +// CHECK: "tfl.cast"([[BMM]]) : (tensor) -> tensor +} + // ----- func.func @select_v2_with_6d_broadcasting(%arg0: tensor<1x1x1x1x3x1xi1>, %arg1 : tensor<1x1x1x1x1x4xf32>, %arg2 : tensor<1x1x1x2x1x1xf32>) -> tensor<1x1x1x2x3x4xf32> { diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/cast_bf16.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/cast_bf16.mlir new file mode 100644 index 00000000000000..83255ca39a4472 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/cast_bf16.mlir @@ -0,0 +1,74 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck %s + +func.func @main(tensor<4x5xbf16>) -> tensor<4x5xbf16> { +^bb0(%arg0: tensor<4x5xbf16>): + +// CHECK: { +// CHECK-NEXT: version: 3, +// CHECK-NEXT: operator_codes: [ { +// CHECK-NEXT: deprecated_builtin_code: 53, +// CHECK-NEXT: version: 7, +// CHECK-NEXT: builtin_code: CAST +// CHECK-NEXT: } ], +// CHECK-NEXT: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 4, 5 ], +// CHECK-NEXT: type: BFLOAT16, +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "arg0", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: has_rank: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4, 5 ], +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "cast1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: has_rank: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 
4, 5 ], +// CHECK-NEXT: type: BFLOAT16, +// CHECK-NEXT: buffer: 3, +// CHECK-NEXT: name: "cast2", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: has_rank: true +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ 0 ], +// CHECK-NEXT: outputs: [ 2 ], +// CHECK-NEXT: operators: [ { +// CHECK-NEXT: inputs: [ 0 ], +// CHECK-NEXT: outputs: [ 1 ] +// CHECK-NEXT: }, { +// CHECK-NEXT: inputs: [ 1 ], +// CHECK-NEXT: outputs: [ 2 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: name: "main" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", +// CHECK-NEXT: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 4 +// CHECK-NEXT: } ], +// CHECK-NEXT: signature_defs: [ ] +// CHECK-NEXT: } + + %0 = "tfl.cast" (%arg0) : (tensor<4x5xbf16>) -> tensor<4x5xf32> loc("cast1") + %1 = "tfl.cast" (%0) : (tensor<4x5xf32>) -> tensor<4x5xbf16> loc("cast2") + func.return %1 : tensor<4x5xbf16> +} diff --git a/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc b/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc index 5ce7638f4e4da1..96d75cca30a48d 100644 --- a/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc @@ -345,22 +345,41 @@ StatusOr ConvertFloatBuffer( switch (elem_type.getIntOrFloatBitWidth()) { case 16: { assert(bytes_len % 2 == 0); - assert(elem_type.isF16()); + // Supports both BF16 and F16. + assert(elem_type.isF16() || elem_type.isBF16()); int elem_count = bytes_len / 2; - std::vector values; - values.reserve(elem_count); - const char* data = reinterpret_cast(buffer.data()); + if (elem_type.isF16()) { + std::vector values; + values.reserve(elem_count); - for (int i = 0; i < elem_count; i++) { - uint16_t bit_repr = - llvm::support::endian::readNext(data); - values.push_back(Eigen::numext::bit_cast(bit_repr)); - } + const char* data = reinterpret_cast(buffer.data()); - return mlir::ElementsAttr( - DenseElementsAttr::get(shaped_type, ArrayRef(values))); + for (int i = 0; i < elem_count; i++) { + uint16_t bit_repr = llvm::support::endian::readNext< + uint16_t, llvm::endianness::native, llvm::support::unaligned>( + data); + values.push_back(Eigen::numext::bit_cast(bit_repr)); + } + + return mlir::ElementsAttr( + DenseElementsAttr::get(shaped_type, ArrayRef(values))); + } else { + std::vector values; + values.reserve(elem_count); + + const char* data = reinterpret_cast(buffer.data()); + + for (int i = 0; i < elem_count; i++) { + uint16_t bit_repr = llvm::support::endian::readNext< + uint16_t, llvm::endianness::native, llvm::support::unaligned>( + data); + values.push_back(Eigen::numext::bit_cast(bit_repr)); + } + + return mlir::ElementsAttr(DenseElementsAttr::get( + shaped_type, ArrayRef(values))); + } } case 32: { assert(bytes_len % 4 == 0); diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc index b5d8bb151e7145..e6044fc6881990 100644 --- a/tensorflow/lite/tools/versioning/op_version.cc +++ b/tensorflow/lite/tools/versioning/op_version.cc @@ -1045,8 +1045,11 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { } return 2; case BuiltinOperator_CAST: - if (op_sig.inputs.at(0).type == kTfLiteInt4 && - op_sig.outputs.at(0).type 
== kTfLiteFloat32) { + if (op_sig.inputs.at(0).type == kTfLiteBFloat16 || + op_sig.outputs.at(0).type == kTfLiteBFloat16) { + return 7; + } else if (op_sig.inputs.at(0).type == kTfLiteInt4 && + op_sig.outputs.at(0).type == kTfLiteFloat32) { return 6; } else if (op_sig.inputs.at(0).type == kTfLiteFloat64 || op_sig.outputs.at(0).type == kTfLiteFloat64 || From 5cac1546945d2971ab5c0f1ea9ead1c4c32f0bf0 Mon Sep 17 00:00:00 2001 From: Harshit Monish <143435143+hmonishN@users.noreply.github.com> Date: Mon, 18 Mar 2024 14:20:06 -0700 Subject: [PATCH 039/670] PR #10635: Fix build error from PR 10497 Imported from GitHub PR https://github.com/openxla/xla/pull/10635 Added changes to fix build error that were encountered after merging changes from PR: https://github.com/openxla/xla/pull/10497 Used ::tsl::testing::StatusIs instead of ::testing::status::StatusIs Copybara import of the project: -- d344105a62e5f1749d994c2f1f2a8a76a5880c3d by hmonishN : Adding changes to use tsl::testing:StatusIs Merging this change closes #10635 PiperOrigin-RevId: 616945601 --- third_party/xla/xla/service/gpu/BUILD | 1 + third_party/xla/xla/service/gpu/autotuner_util_test.cc | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD index 508fd8f638e8a5..b94c1f10f9ce84 100644 --- a/third_party/xla/xla/service/gpu/BUILD +++ b/third_party/xla/xla/service/gpu/BUILD @@ -5968,6 +5968,7 @@ xla_cc_test( "//xla/tests:hlo_test_base", "@local_tsl//tsl/lib/core:status_test_util", "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:status_matchers", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:protobuf", ]) + [ diff --git a/third_party/xla/xla/service/gpu/autotuner_util_test.cc b/third_party/xla/xla/service/gpu/autotuner_util_test.cc index b755334876cc26..28ec27c64e8da0 100644 --- a/third_party/xla/xla/service/gpu/autotuner_util_test.cc +++ b/third_party/xla/xla/service/gpu/autotuner_util_test.cc @@ -36,6 +36,7 @@ limitations under the License. 
#include "tsl/platform/env.h" #include "tsl/platform/logging.h" // IWYU pragma: keep #include "tsl/platform/protobuf.h" // IWYU pragma: keep +#include "tsl/platform/status_matchers.h" namespace xla { namespace gpu { @@ -45,7 +46,7 @@ using ::testing::HasSubstr; using ::testing::IsEmpty; using ::testing::Not; using ::testing::TempDir; -using ::testing::status::StatusIs; +using ::tsl::testing::StatusIs; class AutotunerUtilTest : public HloTestBase { protected: From 98830e91ac878d1ee15b7539dc72bb06adb2fc2f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 18 Mar 2024 14:59:44 -0700 Subject: [PATCH 040/670] [xla:hlo] Use llvm::BitVector instead of a set when checking reachability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit name old cpu/op new cpu/op delta BM_HloDfsReachabilityBuild/1 109ns ± 4% 111ns ± 4% ~ BM_HloDfsReachabilityBuild/64 1.71µs ± 6% 1.71µs ± 4% ~ BM_HloDfsReachabilityBuild/128 3.38µs ± 3% 3.43µs ± 3% +1.54% BM_HloDfsReachabilityBuild/256 6.80µs ± 4% 6.95µs ± 5% +2.25% BM_HloDfsReachabilityBuild/512 13.8µs ± 4% 14.2µs ± 6% +2.63% BM_HloDfsReachabilityBuild/4096 155µs ± 4% 157µs ± 4% ~ BM_HloDfsReachabilityBuild/32768 1.42ms ± 5% 1.45ms ± 3% +1.94% BM_HloDfsReachabilityBuild/262144 32.2ms ± 4% 32.1ms ± 4% ~ BM_HloDfsReachabilityCheck/1 7.37ns ± 3% 7.41ns ± 4% ~ BM_HloDfsReachabilityCheck/64 295ns ± 5% 139ns ± 8% -52.78% BM_HloDfsReachabilityCheck/128 679ns ± 3% 278ns ± 7% -59.05% BM_HloDfsReachabilityCheck/256 1.53µs ± 5% 0.61µs ± 6% -60.06% BM_HloDfsReachabilityCheck/512 3.06µs ± 5% 1.31µs ± 6% -57.27% BM_HloDfsReachabilityCheck/4096 30.2µs ± 7% 17.9µs ± 4% -40.53% BM_HloDfsReachabilityCheck/32768 532µs ± 4% 327µs ± 5% -38.52% BM_HloDfsReachabilityCheck/262144 8.72ms ± 3% 6.66ms ± 4% -23.59% PiperOrigin-RevId: 616956892 --- .../xla/xla/hlo/ir/hlo_dfs_reachability.cc | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc b/third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc index 2e6bd0e8495369..ae9b25f7453e98 100644 --- a/third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc +++ b/third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc @@ -20,7 +20,7 @@ limitations under the License. #include #include "absl/algorithm/container.h" -#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "xla/hlo/ir/hlo_computation.h" #include "xla/hlo/ir/hlo_instruction.h" @@ -45,16 +45,20 @@ bool HloDfsReachability::IsReachable(const HloInstruction* from, // Note that the DFS goes from the "uses" root towards the "defs", i.e. from // `to` node to `from` node, so the node indices are decreasing. - if (target_node_idx > dfs_root_idx) { + if (dfs_root_idx < target_node_idx) { return false; } - // We use LLVM support library here because it has stack-allocated maps (in - // contrast to absl) which significantly improves performance by avoiding heap - // allocations when instructions are reachable via a short chain. - llvm::SmallDenseSet visited_idxs{dfs_root_idx}; + // We use LLVM support library here because it has stack-allocated bit vector + // which significantly improves performance by avoiding heap allocations when + // instructions are reachable via a short chain. llvm::SmallVector stack{to}; + // We will visit instructions in the [target_node_idx, dfs_root_idx] range, so + // we can construct a smaller bit vector. 
+ llvm::BitVector visited_idxs(1 + (dfs_root_idx - target_node_idx)); + visited_idxs.set(dfs_root_idx - target_node_idx); + auto check_and_enqueue = [&](const HloInstruction* instr) { if (instr == from) { return true; @@ -63,9 +67,11 @@ bool HloDfsReachability::IsReachable(const HloInstruction* from, if (instr_idx < target_node_idx) { return false; } - if (auto [_, inserted] = visited_idxs.insert(instr_idx); !inserted) { + size_t visited_idx = instr_idx - target_node_idx; + if (visited_idxs.test(visited_idx)) { return false; } + visited_idxs.set(visited_idx); stack.push_back(instr); return false; }; From 1e3478bf321872470b63032ac98401905e1f81ad Mon Sep 17 00:00:00 2001 From: prrathi <53785742+prrathi@users.noreply.github.com> Date: Mon, 18 Mar 2024 15:00:04 -0700 Subject: [PATCH 041/670] PR #10642: Make configure.py command visible Imported from GitHub PR https://github.com/openxla/xla/pull/10642 Copybara import of the project: -- d5df8a14b8837c0980a8641a83b9d6d9e33577cc by prrathi <53785742+prrathi@users.noreply.github.com>: Make configure.py command visible Merging this change closes #10642 PiperOrigin-RevId: 616956998 --- third_party/xla/docs/build_from_source.md | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/third_party/xla/docs/build_from_source.md b/third_party/xla/docs/build_from_source.md index 91ef1e49608818..c273f7f3cdf8c0 100644 --- a/third_party/xla/docs/build_from_source.md +++ b/third_party/xla/docs/build_from_source.md @@ -10,13 +10,12 @@ If you did not clone the XLA repository or install Bazel, please check out the ### Configure XLA builds are configured by the `.bazelrc` file in the repository's root -directory. The `./configure.py` script can be used to adjust -common settings. +directory. The `./configure.py` script can be used to adjust common settings. -If you need to change the configuration, run the `./configure.py` script from the -repository's root directory. This script has flags for the location of XLA -dependencies and additional build configuration options (compiler -flags, for example). Refer to the *Sample session* section for details. +If you need to change the configuration, run the `./configure.py` script from +the repository's root directory. This script has flags for the location of XLA +dependencies and additional build configuration options (compiler flags, for +example). Refer to the *Sample session* section for details. ### CPU support @@ -27,26 +26,29 @@ We recommend using a suitable docker container to build/test XLA, such as docker run --name xla -w /xla -it -d --rm -v $PWD:/xla tensorflow/build:latest-python3.9 bash ``` -Using a docker container you can build XLA with CPU support using the following commands: +Using a docker container you can build XLA with CPU support using the following +commands: ``` docker exec xla ./configure.py --backend=CPU docker exec xla bazel build //xla/... --spawn_strategy=sandboxed --test_output=all ``` -If you want to build XLA targets with CPU support without Docker you need to install clang. XLA currently builds on CI with clang-17, but earlier versions should also work: +If you want to build XLA targets with CPU support without Docker you need to +install clang. XLA currently builds on CI with clang-17, but earlier versions +should also work: ``` apt install clang ``` Then configure and build targets using the following commands: -``` ./configure.py --backend=CPU +```sh +./configure.py --backend=CPU bazel build --test_output=all --spawn_strategy=sandboxed //xla/... 
``` - ### GPU support We recommend using the same docker container as above to build XLA with GPU @@ -76,6 +78,5 @@ Then configure and build targets using the following commands: bazel build --test_output=all --spawn_strategy=sandboxed //xla/... ``` - For more details regarding [TensorFlow's GPU docker images you can check out this document.](https://www.tensorflow.org/install/source#gpu_support_3) From 8c29d81c5b08f8ae7e86f68cc7d8e49a7832b183 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Mon, 18 Mar 2024 15:02:24 -0700 Subject: [PATCH 042/670] PR #10562: Add missing warmup run when autotuning. Imported from GitHub PR https://github.com/openxla/xla/pull/10562 The `xla/service/gpu:gemm_algorithm_picker_test` test, run on V100, was hitting the delay kernel timeout because of this. See #9757 for explanation of why the best practice is to execute a warmup run **without the GpuTimer active**. Copybara import of the project: -- b4ccf2928ee45ec9139db003378095b948bb73d5 by Olli Lupton : Add missing warmup run when autotuning. The xla/service/gpu:gemm_algorithm_picker_test test, run on V100, was hitting the delay kernel timeout because of this. Merging this change closes #10562 PiperOrigin-RevId: 616957791 --- third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc b/third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc index 9ed90d2ba6eadd..446cde8de272de 100644 --- a/third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc +++ b/third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc @@ -186,6 +186,12 @@ class GemmAutotuner { -> absl::StatusOr { se::OwningScratchAllocator<> scratch_allocator( stream_->parent()->device_ordinal(), autotune_config_.GetAllocator()); + // Run a warmup iteration without the profiler active. + TF_RETURN_IF_ERROR(plan->ExecuteOnStream( + stream_, lhs_buffer_, rhs_buffer_, output_buffer_, output_buffer_, + bias_buffer, aux_buffer, a_scale_buffer, b_scale_buffer, + c_scale_buffer, d_scale_buffer, d_amax_buffer, algorithm, + scratch_allocator)); se::blas::ProfileResult profile_result; TF_RETURN_IF_ERROR(plan->ExecuteOnStream( stream_, lhs_buffer_, rhs_buffer_, output_buffer_, output_buffer_, From 0ca187b3ced9b97ade8322a26d3a40ea2c9c38bd Mon Sep 17 00:00:00 2001 From: Swachhand Lokhande Date: Mon, 18 Mar 2024 15:10:28 -0700 Subject: [PATCH 043/670] Force creating XlaSharding ops for optimizer slot variables. We do a read_value on the slot variables when creating the update op (eg. ResourceApplyAdagrad) for the optimizer to make sure the XlaSharding op is also generated. We make this a control dependency for the update op so that this appears before it. 
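As a rough sketch of the pattern described above (simplified and not part of the patch itself: the names `apply_update`/`slot`, the `assign_add` stand-in for ResourceApplyAdagrad, and the use of the public `tf` API instead of the internal modules touched by the real change in tensorflow/python/training/optimizer.py below are all hypothetical):

```python
# Sketch only: re-assign a slot variable's own read value under a control
# dependency so the ReadVariableOp (and any attached XlaSharding op) is
# emitted before the update op.
import tensorflow as tf


@tf.function
def apply_update(slot_var, grad):
  reassign = slot_var.assign(slot_var.read_value())  # creates a ReadVariableOp
  with tf.control_dependencies([reassign]):          # reassign appears first
    return slot_var.assign_add(grad)                 # stand-in for the real apply op


slot = tf.Variable(tf.zeros([4]))
apply_update(slot, tf.ones([4]))
```

In the actual change, such reassigns are collected for every XLA-sharded slot variable and attached as control dependencies of the dense resource-apply op.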
PiperOrigin-RevId: 616960034 --- .../python/compiler/xla/experimental/BUILD | 25 ++++ .../resource_variable_xla_sharding_test.py | 136 ++++++++++++++++++ tensorflow/python/training/optimizer.py | 25 +++- 3 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 tensorflow/python/compiler/xla/experimental/resource_variable_xla_sharding_test.py diff --git a/tensorflow/python/compiler/xla/experimental/BUILD b/tensorflow/python/compiler/xla/experimental/BUILD index 8cc63502e0869a..c2e2dd9d45af60 100644 --- a/tensorflow/python/compiler/xla/experimental/BUILD +++ b/tensorflow/python/compiler/xla/experimental/BUILD @@ -1,4 +1,5 @@ load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test") +load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_strict_test") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -40,3 +41,27 @@ py_strict_test( "@absl_py//absl/testing:absltest", ], ) + +tpu_py_strict_test( + name = "resource_variable_xla_sharding_test", + srcs = ["resource_variable_xla_sharding_test.py"], + disable_v3_4chips = False, + python_version = "PY3", + srcs_version = "PY3", + tags = ["requires-net:external"], + deps = [ + ":xla_sharding", + "//tensorflow/python/distribute/cluster_resolver:tpu_cluster_resolver_py", + "//tensorflow/python/eager:context", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/eager:test", + "//tensorflow/python/framework:config", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:variables", + "//tensorflow/python/tpu:device_assignment", + "//tensorflow/python/tpu:tpu_py", + "//tensorflow/python/training:adagrad", + ], +) diff --git a/tensorflow/python/compiler/xla/experimental/resource_variable_xla_sharding_test.py b/tensorflow/python/compiler/xla/experimental/resource_variable_xla_sharding_test.py new file mode 100644 index 00000000000000..ef7192a4f45807 --- /dev/null +++ b/tensorflow/python/compiler/xla/experimental/resource_variable_xla_sharding_test.py @@ -0,0 +1,136 @@ +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from tensorflow.python.compiler.xla.experimental import xla_sharding +from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver +from tensorflow.python.eager import context +from tensorflow.python.eager import def_function +from tensorflow.python.eager import test +from tensorflow.python.framework import config +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.tpu import device_assignment +from tensorflow.python.tpu import tpu +from tensorflow.python.training import adagrad + + +# Gets all the nodes of `op` in graph that have `input_node_name` as one of the +# inputs +def _get_op_nodes_with_input(input_node_name, op, graph): + nodes_with_input = [] + for node in graph.node: + nodes_with_input += [ + node + for input in node.input + if input == input_node_name and node.op == op + ] + return nodes_with_input + + +# Gets XlaSharding ops connected to ReadVariableOp for the given variable_name +def _get_xla_sharding_nodes_for_variable(variable_name, graph): + read_variable_op_nodes = _get_op_nodes_with_input( + variable_name, 'ReadVariableOp', graph + ) + xla_sharding_op_nodes = [] + for read_variable_op_node in read_variable_op_nodes: + xla_sharding_op_nodes += _get_op_nodes_with_input( + read_variable_op_node.name, 'XlaSharding', graph + ) + return xla_sharding_op_nodes + + +def _get_xla_sharding_proto_from_node(node): + sharding_proto = xla_sharding.xla_data_pb2.OpSharding() + sharding_proto.ParseFromString(node.attr['sharding'].s) + return sharding_proto + + +class ResourceVariableXlaShardingTest(test.TestCase): + + def setUp(self) -> None: + super().setUp() + + context.enable_xla_sharding_for_resource_variables() + self.topology = tpu_cluster_resolver.initialize_tpu_system() + if len(config.list_logical_devices('TPU')) != 8: + self.skipTest('All tests require 8 TPUs.') + + self.da = device_assignment.DeviceAssignment.build( + self.topology, computation_shape=[2, 2, 1, 2], num_replicas=1 + ) + + def test_xla_sharding_ops_created_for_optimizer_slot_variables(self): + w = variables.Variable( + initial_value=math_ops.range(8, dtype=dtypes.float32), + name='w', + ) + self.assertIsInstance(w, resource_variable_ops.BaseResourceVariable) + w = xla_sharding.split( + w, + split_dimension=0, + num_devices=8, + ) + sharding_proto = xla_sharding.xla_data_pb2.OpSharding() + sharding_proto.ParseFromString(xla_sharding.get_tensor_sharding(w)) + opt = adagrad.AdagradOptimizer(1.0) + + @def_function.function + def computation(x): + def tpu_fn(x): + y = math_ops.add(w, x) + loss = math_ops.reduce_sum(y) + opt.minimize(loss, None, [w]) + return loss + + output = tpu.replicate(tpu_fn, [[x]], device_assignment=self.da) + return output + + inputs = array_ops.reshape(math_ops.range(16, dtype=dtypes.float32), (2, 8)) + result = computation(inputs) + self.assertSequenceEqual([[176.0]], self.evaluate(result)) + graph = computation.get_concrete_function(inputs).graph.as_graph_def() + + update_op_nodes = [ + node for node in graph.node if node.op == 'ResourceApplyAdagrad' + ] + self.assertLen(update_op_nodes, 1) + update_op_node = update_op_nodes[0] + + var_input_name = update_op_node.input[0] + var_sharding_nodes = _get_xla_sharding_nodes_for_variable( + var_input_name, graph + ) + 
self.assertLen(var_sharding_nodes, 1) + self.assertProtoEquals( + _get_xla_sharding_proto_from_node(var_sharding_nodes[0]), sharding_proto + ) + + slot_var_input_name = update_op_node.input[1] + slot_var_sharding_nodes = _get_xla_sharding_nodes_for_variable( + slot_var_input_name, graph + ) + self.assertLen(slot_var_sharding_nodes, 1) + self.assertProtoEquals( + _get_xla_sharding_proto_from_node(slot_var_sharding_nodes[0]), + sharding_proto, + ) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 7536c6ce90692f..5a438ce2d52d05 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -169,7 +169,30 @@ def update_op(self, optimizer, g): "Cannot use a constraint function on a sparse variable.") return optimizer._resource_apply_sparse_duplicate_indices( g.values, self._v, g.indices) - update_op = optimizer._resource_apply_dense(g, self._v) + + if context.xla_sharding_for_resource_variables_enabled(): + # For each slot variable that is annotated with an XLA sharding, we read + # the variable and assign the value to itself. This is done to trigger the + # creation of an XlaShardingOp when a ReadVariableOp is created upon the + # call to `slot_var.read_value()`. This is needed to ensure that slot + # variables with XLA sharding are sharded correctly. Please see + # b/307541427 for more details. + assign_ops = [] + for variable_dict in optimizer._slots.values(): + for slot_var in variable_dict.values(): + if ( + isinstance(slot_var, resource_variable_ops.BaseResourceVariable) + and slot_var._get_xla_sharding() is not None + ): + assign_ops.append(slot_var.assign(slot_var.read_value())) + + # The assign_ops created above are added as a control dependency for the + # update op to make sure these appear before the update_op. + with ops.control_dependencies(assign_ops): + update_op = optimizer._resource_apply_dense(g, self._v) + else: + update_op = optimizer._resource_apply_dense(g, self._v) + if self._v.constraint is not None: with ops.control_dependencies([update_op]): return self._v.assign(self._v.constraint(self._v)) From cd60a4dd509b300469607267b00a3513796748e6 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 18 Mar 2024 15:11:02 -0700 Subject: [PATCH 044/670] [xla:hlo] Do not compute channel dependencies when building DFS reachability PiperOrigin-RevId: 616960175 --- third_party/xla/xla/hlo/ir/hlo_computation.cc | 4 +++- third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/third_party/xla/xla/hlo/ir/hlo_computation.cc b/third_party/xla/xla/hlo/ir/hlo_computation.cc index 16c9e0ce37b1c8..7d8a080bd3840a 100644 --- a/third_party/xla/xla/hlo/ir/hlo_computation.cc +++ b/third_party/xla/xla/hlo/ir/hlo_computation.cc @@ -31,6 +31,7 @@ limitations under the License. 
#include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/container/inlined_vector.h" +#include "absl/functional/function_ref.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" @@ -506,6 +507,7 @@ void HloComputation::ForEachInstructionPostOrderImpl( absl::FunctionRef func, HloInstruction* root, const ChannelDependencies& channel_dependencies, VisitMap& visited, std::vector* dfs_stack_scratch) const { + bool has_channel_dependencies = !channel_dependencies.empty(); auto* dfs_stack = dfs_stack_scratch; dfs_stack->clear(); dfs_stack->push_back(root); @@ -532,7 +534,7 @@ void HloComputation::ForEachInstructionPostOrderImpl( // Collectives with the same channel ID must be performed together, as these // represent MPMD-partitioned that will later be split into separate modules // and the order must be preserved. - if (¤t != root) { + if (has_channel_dependencies && ¤t != root) { auto it = channel_dependencies.find(¤t); if (it != channel_dependencies.end()) { dfs_stack->insert(dfs_stack->end(), it->second.begin(), diff --git a/third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc b/third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc index ae9b25f7453e98..c831f31cec03f1 100644 --- a/third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc +++ b/third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc @@ -96,10 +96,11 @@ std::unique_ptr HloDfsReachability::Build( const HloComputation* computation) { auto res = std::make_unique(); - HloComputation::ChannelDependencies channel_dependencies = - computation->ComputeChannelDependencies(); + // For instruction reachability we do not care about correct order of + // collective operations as we only care about use-def chains. + HloComputation::ChannelDependencies empty_channel_dependencies; std::vector instructions = - computation->MakeInstructionPostOrder(channel_dependencies); + computation->MakeInstructionPostOrder(empty_channel_dependencies); res->instruction_to_idx_.reserve(instructions.size()); for (size_t i = 0; i < instructions.size(); ++i) { From 392a5f0d120a82e1bd4c9af486fe786ee04931bc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 Mar 2024 15:11:57 -0700 Subject: [PATCH 045/670] Temporarily disables failing shared_batch_scheduler_test on Windows. PiperOrigin-RevId: 616960378 --- tensorflow/core/kernels/batching_util/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD index d34bd7331a35d5..828b1c0f60d4fb 100644 --- a/tensorflow/core/kernels/batching_util/BUILD +++ b/tensorflow/core/kernels/batching_util/BUILD @@ -190,6 +190,7 @@ tf_cc_test( name = "shared_batch_scheduler_test", size = "small", srcs = ["shared_batch_scheduler_test.cc"], + tags = ["no_windows"], deps = [ ":batch_scheduler", ":fake_clock_env", From ff0308108e154928b033b1ab01fb8512b9786c18 Mon Sep 17 00:00:00 2001 From: Dan Suh Date: Mon, 18 Mar 2024 15:55:04 -0700 Subject: [PATCH 046/670] Fix readability issues in `quantization_driver.h/cc`. 
PiperOrigin-RevId: 616971652 --- .../mlir/lite/quantization/lite/BUILD | 1 + .../common/quantization_lib/BUILD | 1 + .../quantization_lib/quantization_driver.cc | 408 +++++++++--------- .../quantization_lib/quantization_driver.h | 154 +++---- .../quantization_lib/quantization_utils.h | 8 +- 5 files changed, 299 insertions(+), 273 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD index ad7c1905440297..a0f55e0408932f 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD @@ -91,6 +91,7 @@ cc_library( "//tensorflow/compiler/mlir/quantization/common/quantization_lib", "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/BUILD b/tensorflow/compiler/mlir/quantization/common/quantization_lib/BUILD index d41a189519fd6d..a0d64569562d38 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/BUILD +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/BUILD @@ -35,6 +35,7 @@ cc_library( "//tensorflow/core:protos_all_cc", "//tensorflow/lite/kernels/internal:tensor_utils", "//tensorflow/lite/tools/optimize:quantization_utils", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc index 962c6656f55b65..327d109946e031 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc @@ -26,7 +26,6 @@ limitations under the License. #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Casting.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project @@ -47,39 +46,44 @@ limitations under the License. namespace mlir { namespace quant { - namespace { -// This is used to identify an operand or result of an op. The second element -// of this pair is the index of the operand or result. -using OpValue = std::pair; + +constexpr int32_t kBiasMax = std::numeric_limits::max() / 2; // Uses the type of `value` to set the initial state of the index-th result if // `as_result` is true or index-th operand if `as_result` is false. The state // is immutable if the type is a quantized type. Returns the index of this // new state in the state vector. 
-void InitializeStateForValue(Operation* op, const int index, const Value value, - const bool as_result, - std::vector* states, - llvm::DenseMap* value_to_state, - llvm::DenseMap* operand_states, - llvm::DenseMap* result_states) { - const auto [cached, inserted] = value_to_state->insert({value, 0}); +void InitializeStateForValue( + Operation* op, const int index, const Value value, const bool as_result, + std::vector& states, + DenseMap& value_to_state, + DenseMap& operand_states, + DenseMap& result_states) { + const auto [cached, inserted] = value_to_state.try_emplace(value, 0); if (!inserted) { - if (as_result) - (*result_states)[{op, index}] = cached->second; - else - (*operand_states)[{op, index}] = cached->second; + if (as_result) { + result_states[{op, index}] = cached->second; + } else { + operand_states[{op, index}] = cached->second; + } return; } - const QuantParams params = - quant::QuantizedType::getQuantizedElementType(value.getType()); - const bool immutable = !HasQuantParams(params); - const int next_state_index = states->size(); - states->push_back({params, immutable}); - if (as_result) - (*result_states)[{op, index}] = next_state_index; - else - (*operand_states)[{op, index}] = next_state_index; + + const QuantizedType quantized_type = + QuantizedType::getQuantizedElementType(value.getType()); + + const bool immutable = quantized_type != nullptr; + const QuantizationDriver::QuantStateIndex next_state_index = states.size(); + states.push_back({quantized_type, immutable}); + if (as_result) { + result_states[{op, index}] = next_state_index; + } else { + operand_states[{op, index}] = next_state_index; + } + cached->second = next_state_index; } @@ -87,32 +91,31 @@ void InitializeStateForValue(Operation* op, const int index, const Value value, void QuantizationDriver::InitializeArgState(const BlockArgument arg, const Value arg_value) { - const auto [cached, inserted] = value_to_state_.insert({arg_value, 0}); + const auto [cached, inserted] = value_to_state_.try_emplace(arg_value, 0); if (!inserted) { arg_states_[arg] = cached->second; return; } - const QuantParams params = - quant::QuantizedType::getQuantizedElementType(arg_value.getType()); - const bool immutable = !HasQuantParams(params); - const int next_state_index = states_.size(); - states_.push_back({params, immutable}); + + const QuantizedType quantized_type = + QuantizedType::getQuantizedElementType(arg_value.getType()); + const bool immutable = quantized_type != nullptr; + const QuantizationDriver::QuantStateIndex next_state_index = states_.size(); + states_.push_back({quantized_type, immutable}); arg_states_[arg] = next_state_index; cached->second = next_state_index; } void QuantizationDriver::InitializeOperandState(Operation* op, const int index, const Value value) { - ::mlir::quant::InitializeStateForValue(op, index, value, /*as_result=*/false, - &states_, &value_to_state_, - &operand_states_, &result_states_); + InitializeStateForValue(op, index, value, /*as_result=*/false, states_, + value_to_state_, operand_states_, result_states_); } void QuantizationDriver::InitializeResultState(Operation* op, const int index, const Value value) { - ::mlir::quant::InitializeStateForValue(op, index, value, /*as_result=*/true, - &states_, &value_to_state_, - &operand_states_, &result_states_); + InitializeStateForValue(op, index, value, /*as_result=*/true, states_, + value_to_state_, operand_states_, result_states_); } std::unique_ptr QuantizationDriver::GetQuantSpec(Operation* op) { @@ -133,11 +136,11 @@ bool 
QuantizationDriver::IsQuantized(Operation* op) { bool QuantizationDriver::SetConstantResultParams(Operation* op) { DenseFPElementsAttr attr; - const Value res = op->getResult(0); - if (!matchPattern(res, m_Constant(&attr))) { + const Value result = op->getResult(0); + if (!matchPattern(result, m_Constant(&attr))) { return false; } - // TODO(fengliuai): make storage_type_width and narrow_range configurable. + // TODO: b/323478683 - Make storage_type_width and narrow_range configurable. Type final_type; const auto it = optimized_weights_.find(op); const bool is_weight = it != optimized_weights_.end(); @@ -159,42 +162,44 @@ bool QuantizationDriver::SetConstantResultParams(Operation* op) { final_type = GetUniformQuantizedTypeForWeight( attr, /*symmetric=*/is_weight && is_signed_, /*num_bits=*/8, is_signed_, - /*narrow_range_=*/is_weight, legacy_float_scale_); + /*narrow_range=*/is_weight, legacy_float_scale_); } - if (const auto quant_type = - final_type.dyn_cast_or_null()) { - return SetResultParams(op, 0, quant_type); + if (const auto quant_type = final_type.dyn_cast_or_null(); + quant_type != nullptr) { + return SetResultParams(op, /*result_index=*/0, quant_type); } return false; } -bool QuantizationDriver::SetResultParams(Operation* op, const int res_index, - const QuantParams params) { - auto& state = GetResultQuantState(op, res_index); - if (state.params == params) { +bool QuantizationDriver::SetResultParams(Operation* op, const int result_index, + const QuantizedType quantized_type) { + QuantState& state = GetResultQuantState(op, result_index); + if (state.params == quantized_type) { return false; } if (!state.IsEmpty()) { - auto& rescales = GetResultRequantizeStates(op, res_index); + RequantizeStates& rescales = GetResultRequantizeStates(op, result_index); RequantizeState& rescale = rescales.emplace_back(); rescale.pos = RequantizeState::ON_INPUT; - rescale.params = params; + rescale.params = quantized_type; return true; } - state.params = params; - AddUserToList(op, res_index); + state.params = quantized_type; + AddUserToList(op, result_index); return true; } -QuantParams QuantizationDriver::GetBiasParams( - Operation* op, const int bias_index, const std::vector& non_biases, +QuantizedType QuantizationDriver::GetBiasParams( + Operation* op, const int bias_index, + const ArrayRef non_bias_operand_indices, const AccumulatorScaleFunc func) { QuantState& bias_state = GetOperandQuantState(op, bias_index); if (!bias_state.IsEmpty()) { return bias_state.params; } - std::vector op_types; - op_types.reserve(non_biases.size()); + std::vector op_types{}; + op_types.reserve(non_bias_operand_indices.size()); + int adjusted_quant_dim = -1; if (op->getNumOperands() > bias_index) { // Some kernels allow 1D bias, broadcasting it inside the kernel. 
In this @@ -211,68 +216,75 @@ QuantParams QuantizationDriver::GetBiasParams( } } - for (int non_bias : non_biases) { - const QuantState& non_bias_type = GetOperandQuantState(op, non_bias); - op_types.push_back(non_bias_type.params); + for (const int non_bias_operand_index : non_bias_operand_indices) { + const QuantState& non_bias_state = + GetOperandQuantState(op, non_bias_operand_index); + op_types.push_back(non_bias_state.params); } return func(op_types, adjusted_quant_dim, legacy_float_scale_); } -bool QuantizationDriver::SetOperandParams(Operation* op, const int index, - const QuantParams params, +bool QuantizationDriver::SetOperandParams(Operation* op, + const int operand_index, + const QuantizedType quantized_type, const bool override) { - auto& state = GetOperandQuantState(op, index); - if (state.params == params) { + QuantState& state = GetOperandQuantState(op, operand_index); + if (state.params == quantized_type) { return false; } if (!state.IsEmpty() && !override) { - auto& rescales = GetOperandRequantizeStates(op, index); + RequantizeStates& rescales = GetOperandRequantizeStates(op, operand_index); for (RequantizeState& rescale : rescales) { - if (rescale.params == params) { - rescale.users.emplace_back(op, index); + if (rescale.params == quantized_type) { + rescale.users.emplace_back(op, operand_index); return true; } } RequantizeState& rescale = rescales.emplace_back(); rescale.pos = RequantizeState::ON_OUTPUT; - rescale.params = params; - rescale.users.emplace_back(op, index); + rescale.params = quantized_type; + rescale.users.emplace_back(op, operand_index); return true; } - state.params = params; - AddOperandToList(op, index); + state.params = quantized_type; + AddOperandToList(op, operand_index); return true; } -void QuantizationDriver::QuantizeOpResult(Operation* op, const int index, - const QuantParams params) { +void QuantizationDriver::QuantizeOpResult(Operation* op, const int result_index, + const QuantizedType quantized_type) { builder_.setInsertionPointAfter(op); - const Value original_result = op->getResult(index); - QuantizeValue(original_result, params, op->getLoc()); + const Value original_result = op->getResult(result_index); + QuantizeValue(original_result, quantized_type, op->getLoc()); } -void QuantizationDriver::QuantizeArg(BlockArgument arg, QuantParams params) { +void QuantizationDriver::QuantizeArg(BlockArgument arg, + const QuantizedType quantized_type) { builder_.setInsertionPointToStart(arg.getOwner()); - QuantizeValue(arg, params, builder_.getUnknownLoc()); + QuantizeValue(arg, quantized_type, builder_.getUnknownLoc()); } -void QuantizationDriver::QuantizeValue(Value value, QuantParams params, - Location loc) { +void QuantizationDriver::QuantizeValue(Value value, + QuantizedType quantized_type, + const Location loc) { const Type expressed_type = value.getType(); - const Type new_type = params.castFromExpressedType(expressed_type); - // This value isn't an expressed type (float), skip. - if (!new_type) return; + const Type new_value_type = + quantized_type.castFromExpressedType(expressed_type); + // Skip if `value` or `value`'s element type doesn't match the expressed type + // of `quantized_type`. + if (new_value_type == nullptr) return; + auto quantize = - builder_.create(loc, new_type, value); + builder_.create(loc, new_value_type, value); auto dequantize = builder_.create( loc, expressed_type, quantize.getResult()); // This attribute is set to distinguish the quantize ops being added by the // quantization pass. 
These ops can be removed without losing original // program accuracy. - // TODO(fengliuai): make the attribute being part of op definition. + // TODO: b/323478683 - Make the attribute being part of op definition. quantize->setAttr(kVolatileOpAttrName, builder_.getUnitAttr()); // `original_result` has a use to `quantize`, so this will replace that use @@ -281,17 +293,18 @@ void QuantizationDriver::QuantizeValue(Value value, QuantParams params, quantize.getOperation()->replaceUsesOfWith(dequantize, value); } -void QuantizationDriver::RequantizeOpResult(Operation* op, const int index, - RequantizeStates* states) { - if (states->empty()) return; +void QuantizationDriver::RequantizeOpResult(Operation* op, + const int result_index, + RequantizeStates& states) { + if (states.empty()) return; builder_.setInsertionPointAfter(op); - Value value = op->getResult(index); - RequantizeState::RequantizePosition pos = states->front().pos; + Value value = op->getResult(result_index); + RequantizeState::RequantizePosition pos = states.front().pos; if (pos == RequantizeState::NO_REQUANTIZE) { return; } - for (auto& state : *states) { + for (const RequantizeState& state : states) { // Check that all requantization positions are the same for each state. // Unsure if this check is required. if (state.pos != pos) { @@ -300,7 +313,7 @@ void QuantizationDriver::RequantizeOpResult(Operation* op, const int index, } if (pos == RequantizeState::ON_OUTPUT) { Operation* user = value.getUses().begin().getUser(); - if (llvm::isa(user)) { + if (isa(user)) { // The requantize op is inserted between `quantize` and `dequantize` ops. value = user->getResult(0); builder_.setInsertionPointAfter(user); @@ -310,12 +323,12 @@ void QuantizationDriver::RequantizeOpResult(Operation* op, const int index, } void QuantizationDriver::RequantizeArg(const BlockArgument arg, - RequantizeStates* states) { + RequantizeStates& states) { Value value = arg; builder_.setInsertionPointToStart(arg.getOwner()); if (value.hasOneUse()) { Operation* user = value.use_begin().getUser(); - if (auto q = llvm::dyn_cast(user)) { + if (auto q = dyn_cast(user)) { value = q.getResult(); builder_.setInsertionPoint(arg.getOwner(), ++Block::iterator(user)); } @@ -323,14 +336,13 @@ void QuantizationDriver::RequantizeArg(const BlockArgument arg, RequantizeValue(value, states, builder_.getUnknownLoc()); } -void QuantizationDriver::RequantizeValue(Value value, RequantizeStates* states, +void QuantizationDriver::RequantizeValue(Value value, RequantizeStates& states, const Location loc) { - if (states->empty() || - states->front().pos == RequantizeState::NO_REQUANTIZE) { + if (states.empty() || states.front().pos == RequantizeState::NO_REQUANTIZE) { return; } - if (states->front().pos == RequantizeState::ON_INPUT) { - auto& state = states->front(); + if (states.front().pos == RequantizeState::ON_INPUT) { + RequantizeState& state = states.front(); const Type expressed_type = value.getType(); // The value needs to be requantized. A Quantize op will be created to use // it as the operand and replace its uses. 
@@ -350,7 +362,7 @@ void QuantizationDriver::RequantizeValue(Value value, RequantizeStates* states, if (!value.hasOneUse()) { return; } - auto dequant_op = llvm::dyn_cast_or_null( + auto dequant_op = dyn_cast_or_null( value.use_begin().getUser()); if (!dequant_op) { return; @@ -363,10 +375,9 @@ void QuantizationDriver::RequantizeValue(Value value, RequantizeStates* states, // Whether to replace quantization params of the first dequantize op // after the quantized value is produced. // If there is a use other than the requantize states, then we can't clobber. - bool clobber_first = num_uses <= states->size(); - for (auto& state : *states) { - Type expressed_type = - quant::QuantizedType::castToExpressedType(value.getType()); + bool clobber_first = num_uses <= states.size(); + for (RequantizeState& state : states) { + Type expressed_type = QuantizedType::castToExpressedType(value.getType()); if (!expressed_type) continue; // The value needs to be requantized. A Quantize op will be created to use // it as the operand and replace its uses. @@ -384,8 +395,8 @@ void QuantizationDriver::RequantizeValue(Value value, RequantizeStates* states, } else { auto new_dequant_op = builder_.create( loc, dequant_op.getResult().getType(), requantize_op.getResult()); - for (auto& op_index : state.users) { - op_index.first->setOperand(op_index.second, new_dequant_op.getResult()); + for (auto [op, operand_idx] : state.users) { + op->setOperand(operand_idx, new_dequant_op.getResult()); } } } @@ -400,12 +411,12 @@ void QuantizationDriver::RequantizeValue(Value value, RequantizeStates* states, // - use the single input if it is ready, or, // - use the single output if it is ready, or, // - use the first ready one in the collection. -QuantParams QuantizationDriver::GetQuantParamsForSameScaleConstraint( +QuantizedType QuantizationDriver::GetQuantParamsForSameScaleConstraint( Operation* op) { // Two vector to collect Non-empty operands and results states. std::vector mutable_states, immutable_states; for (int i = 0; i < op->getNumOperands(); ++i) { - auto& state = GetOperandQuantState(op, i); + QuantState& state = GetOperandQuantState(op, i); if (state.immutable) { immutable_states.push_back(&state); } else if (!state.IsEmpty()) { @@ -422,7 +433,7 @@ QuantParams QuantizationDriver::GetQuantParamsForSameScaleConstraint( } for (int i = 0; i < op->getNumResults(); ++i) { - auto& state = GetResultQuantState(op, i); + QuantState& state = GetResultQuantState(op, i); if (state.immutable) { immutable_states.push_back(&state); } else if (!state.IsEmpty()) { @@ -476,14 +487,11 @@ void QuantizationDriver::PreprocessConstantOps() { // The following loop will change the value uses, thus we cache all the uses // needs to be changed. - llvm::SmallVector> uses; - for (auto& use : value.getUses()) { + SmallVector> uses; + for (OpOperand& use : value.getUses()) { uses.push_back({use.getOwner(), use.getOperandNumber()}); } - for (const auto& indexed_use : llvm::enumerate(uses)) { - Operation* user = indexed_use.value().first; - const int operand_num = indexed_use.value().second; - + for (const auto [user, operand_num] : uses) { const std::unique_ptr spec = GetQuantSpec(user); const std::unique_ptr scale_spec = GetQuantScaleSpec(user); @@ -493,9 +501,9 @@ void QuantizationDriver::PreprocessConstantOps() { // other values. So any constants which are not bias, an operand of an // op with same scale requirements, and haven't been quantized are // weights. 
- if (biases.find(operand_num) == biases.end() && + if (!biases.contains(operand_num) && !scale_spec->has_same_scale_requirement && - !llvm::dyn_cast(user)) { + !dyn_cast(user)) { // Needs to scan the content of weights to get the quantization // parameters if there are no quantization parameters (FakeQuant ops). // For this case, the weight will not be duplicated. @@ -511,9 +519,9 @@ void QuantizationDriver::PreprocessConstantOps() { // other values. Duplicate this constant in case it is shared by // different users. if (uses.size() > 1) { - auto new_cst = + auto new_constant_op = builder_.create(cst.getLoc(), cst.getValue()); - user->setOperand(operand_num, new_cst); + user->setOperand(operand_num, new_constant_op); } } } @@ -521,13 +529,13 @@ void QuantizationDriver::PreprocessConstantOps() { } void QuantizationDriver::SetupAllStates() { - for (auto arg : fn_.getArguments()) { + for (BlockArgument arg : fn_.getArguments()) { args_.push_back(arg); Value value = arg; // If the argument is quantized, it should only has one user. if (arg.hasOneUse()) { Operation* user = value.use_begin().getUser(); - if (auto q = llvm::dyn_cast(user)) { + if (auto q = dyn_cast(user)) { value = q.getResult(); } } @@ -543,29 +551,29 @@ void QuantizationDriver::SetupAllStates() { for (int i = 0; i < op->getNumOperands(); ++i) { Value operand = op->getOperand(i); - if (auto* inst = operand.getDefiningOp()) { + if (Operation* inst = operand.getDefiningOp()) { // If the operand comes from a `quantfork::DequantizeCastOp`, we use // the quantized input of this `quantfork::DequantizeCastOp` to set the // state. - if (auto dq = llvm::dyn_cast(inst)) { + if (auto dq = dyn_cast(inst)) { operand = dq.getArg(); } } InitializeOperandState(op, i, operand); } - for (int res = 0; res < op->getNumResults(); ++res) { - Value result = op->getResult(res); + for (int i = 0; i < op->getNumResults(); ++i) { + Value result = op->getResult(i); // If the result has been quantized, it should only be used by a // `quantfork::QuantizeCastOp`. For this case, we uses the quantized // result to create the state and mark it immutable. if (result.hasOneUse()) { Operation* user = result.use_begin().getUser(); - if (auto q = llvm::dyn_cast(user)) { + if (auto q = dyn_cast(user)) { result = q.getResult(); } } - InitializeResultState(op, res, result); + InitializeResultState(op, i, result); } }); } @@ -577,7 +585,7 @@ arith::ConstantOp QuantizationDriver::DuplicateConstantOpIfNeeded( } OpBuilder builder(op->getContext()); builder.setInsertionPointAfter(op); - arith::ConstantOp new_op = llvm::cast(builder.clone(*op)); + arith::ConstantOp new_op = cast(builder.clone(*op)); target_op->getOpOperand(operand_index).set(new_op.getResult()); InitializeOperandState(target_op, operand_index, new_op.getResult()); InitializeResultState(new_op, 0, new_op.getResult()); @@ -585,13 +593,13 @@ arith::ConstantOp QuantizationDriver::DuplicateConstantOpIfNeeded( } bool QuantizationDriver::ShouldCheckBiasScale( - Operation* op, const int bias_index, const std::vector& input_indices, - const QuantParams params, int& input_index, int& filter_index) { + Operation* op, const int bias_index, ArrayRef input_indices, + const QuantizedType quantized_type, int& input_index, int& filter_index) { // For now, restrict scale adjustment to ops with affine quantized weights, // and having weights and biases as constants. This currently only applies to // FC and Conv* ops. Restriction for the weight can be relaxed if there are // needs for adjusting scale of variable weights. 
- auto affine_op = llvm::dyn_cast(op); + auto affine_op = dyn_cast(op); auto bias_op = op->getOperand(bias_index).getDefiningOp(); if (!affine_op || !bias_op || input_indices.size() != 2) return false; if (!bias_op.getValue().isa()) return false; @@ -607,22 +615,20 @@ bool QuantizationDriver::ShouldCheckBiasScale( return false; } - const auto input_state = GetOperandQuantState(op, input_index); - const auto filter_state = GetOperandQuantState(op, filter_index); + const QuantState& input_state = GetOperandQuantState(op, input_index); + const QuantState& filter_state = GetOperandQuantState(op, filter_index); // If quantization parameter for the filter is fixed, should return it as-is. // Only checks ops with 8-bit input and weights, and 32-bit biases. - if (!(input_state.params.getStorageTypeIntegralWidth() == 8 && - filter_state.params.getStorageTypeIntegralWidth() == 8 && - params.getStorageTypeIntegralWidth() == 32)) { - return false; - } - return true; + return input_state.params.getStorageTypeIntegralWidth() == 8 && + filter_state.params.getStorageTypeIntegralWidth() == 8 && + quantized_type.getStorageTypeIntegralWidth() == 32; } bool QuantizationDriver::SetBiasParamsWithAdjustments( - Operation* op, const int bias_index, const std::vector& input_indices, - const QuantParams params) { + Operation* op, const int bias_index, ArrayRef input_indices, + const QuantizedType params) { bool changed = false; + int input_index; int filter_index; if (!ShouldCheckBiasScale(op, bias_index, input_indices, params, input_index, @@ -630,8 +636,8 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments( return SetOperandParams(op, bias_index, params); } - quant::QuantState input_state = GetOperandQuantState(op, input_index); - quant::QuantState filter_state = GetOperandQuantState(op, filter_index); + QuantState input_state = GetOperandQuantState(op, input_index); + QuantState filter_state = GetOperandQuantState(op, filter_index); auto bias_op = op->getOperand(bias_index).getDefiningOp(); const double input_scale = input_state.params.cast().getScale(); @@ -639,15 +645,15 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments( auto bias_values = bias_op.getValue().cast(); // Restrict maximum absolute value of bias within INT_MAX / 2, to make some // room for accumulator. 
- const int32_t kBiasMax = std::numeric_limits::max() / 2; - if (auto bias_params = params.dyn_cast()) { + if (auto bias_quantized_type = params.dyn_cast(); + bias_quantized_type != nullptr) { double bias_half_range = 0.0f; for (auto bias : bias_values.getValues()) { if (bias_half_range < std::abs(bias.convertToFloat())) { bias_half_range = std::abs(bias.convertToFloat()); } } - if (bias_half_range / bias_params.getScale() < kBiasMax) { + if (bias_half_range / bias_quantized_type.getScale() < kBiasMax) { return SetOperandParams(op, bias_index, params); } const double new_bias_scale = @@ -659,30 +665,36 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments( bias_op->getLoc(), params.getFlags(), params.getStorageType(), params.getExpressedType(), new_bias_scale, 0, params.getStorageTypeMin(), params.getStorageTypeMax())); - auto filter_op = DuplicateConstantOpIfNeeded( + arith::ConstantOp filter_op = DuplicateConstantOpIfNeeded( op->getOperand(filter_index).getDefiningOp(), op, filter_index); if (!filter_op) { return SetOperandParams(op, bias_index, params); } - const auto filter_param = filter_state.params.cast(); + const auto filter_quantized_type = + filter_state.params.cast(); changed |= SetOperandParams( op, filter_index, UniformQuantizedType::getChecked( - filter_op->getLoc(), filter_param.getFlags(), - filter_param.getStorageType(), filter_param.getExpressedType(), - new_bias_scale / input_scale, 0, filter_param.getStorageTypeMin(), - filter_param.getStorageTypeMax()), + filter_op->getLoc(), filter_quantized_type.getFlags(), + filter_quantized_type.getStorageType(), + filter_quantized_type.getExpressedType(), + new_bias_scale / input_scale, 0, + filter_quantized_type.getStorageTypeMin(), + filter_quantized_type.getStorageTypeMax()), /*override=*/true); - } else if (auto bias_params = - params.dyn_cast()) { - const auto filter_params = + } else if (auto bias_quantized_type = + params.dyn_cast(); + bias_quantized_type != nullptr) { + const auto filter_quantized_type = filter_state.params.cast(); - std::vector new_bias_scales = bias_params.getScales().vec(); - std::vector new_filter_scales = filter_params.getScales().vec(); + std::vector new_bias_scales = bias_quantized_type.getScales().vec(); + std::vector new_filter_scales = + filter_quantized_type.getScales().vec(); + bool needs_adjustment = false; - for (int i = 0; i < bias_params.getScales().size(); ++i) { + for (int i = 0; i < bias_quantized_type.getScales().size(); ++i) { const float abs_bias = std::abs(bias_values.getValues()[i]); if (abs_bias / new_bias_scales[i] > kBiasMax) { new_bias_scales[i] = static_cast(abs_bias) / kBiasMax; @@ -698,21 +710,23 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments( quant::UniformQuantizedPerAxisType::getChecked( bias_op->getLoc(), params.getFlags(), params.getStorageType(), params.getExpressedType(), new_bias_scales, - bias_params.getZeroPoints(), bias_params.getQuantizedDimension(), + bias_quantized_type.getZeroPoints(), + bias_quantized_type.getQuantizedDimension(), params.getStorageTypeMin(), params.getStorageTypeMax())); - auto filter_op = DuplicateConstantOpIfNeeded( + arith::ConstantOp filter_op = DuplicateConstantOpIfNeeded( op->getOperand(filter_index).getDefiningOp(), op, filter_index); changed |= SetOperandParams( op, filter_index, quant::UniformQuantizedPerAxisType::getChecked( - filter_op->getLoc(), filter_params.getFlags(), - filter_params.getStorageType(), filter_params.getExpressedType(), - new_filter_scales, filter_params.getZeroPoints(), - 
filter_params.getQuantizedDimension(), - filter_params.getStorageTypeMin(), - filter_params.getStorageTypeMax()), + filter_op->getLoc(), filter_quantized_type.getFlags(), + filter_quantized_type.getStorageType(), + filter_quantized_type.getExpressedType(), new_filter_scales, + filter_quantized_type.getZeroPoints(), + filter_quantized_type.getQuantizedDimension(), + filter_quantized_type.getStorageTypeMin(), + filter_quantized_type.getStorageTypeMax()), /*override=*/true); } return changed; @@ -720,12 +734,12 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments( // This method scans the operations in the function to setup the initial // states for quantization parameter propagation. -// TODO(fengliuai): This algorithm assumes there are only one pair of +// TODO: b/323478683 - This algorithm assumes there are only one pair of // `quantfork::QuantizeCastOp` and `quantfork::DequantizeCastOp` ops between two // quantizable ops. A sanity check should be applied. void QuantizationDriver::Initialize() { // Duplicate the bias constant, so the states can be setup correctly. - // TODO(fengliuai): Function definition should also be duplicated if there + // TODO: b/323478683 - Function definition should also be duplicated if there // are multiple call sites. PreprocessConstantOps(); @@ -736,21 +750,21 @@ void QuantizationDriver::Initialize() { // Propagates the quantization parameters to the operands, results, and biases. // TODO: b/323478683 - Do not use while loop to handle this logic. bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { - // TODO(fengliuai): uses a typed indicator instead of a bool value. + // TODO: b/323478683 - Use a typed indicator instead of a bool value. bool changed = false; while (!work_list_.empty()) { Operation* op = work_list_.back(); work_list_.pop_back(); // This op has been quantized, so we should not consider it again. - if (llvm::is_contained(quantized_, op)) continue; + if (quantized_.contains(op)) continue; quantized_.insert(op); - if (auto cst = llvm::dyn_cast(op)) { + if (auto constant_op = dyn_cast(op); constant_op) { // If the workflow requires inferring ranges from the content // (post-training quantization) and it is weight (filter) and hasn't // been quantized, we infer the quantization parameters from the content. - if (infer_tensor_range_ && IsWeight(cst) && !IsQuantized(op)) { + if (infer_tensor_range_ && IsWeight(constant_op) && !IsQuantized(op)) { // The quantization parameters are determined by the content of the // constant. changed |= SetConstantResultParams(op); @@ -761,7 +775,7 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { std::unique_ptr scale_spec = GetQuantScaleSpec(op); if (scale_spec->has_same_scale_requirement) { - const auto params = GetQuantParamsForSameScaleConstraint(op); + const QuantizedType params = GetQuantParamsForSameScaleConstraint(op); // The quantization parameters haven't been propagated to any operands // or results. Skip this node for now. if (!params) { @@ -792,12 +806,13 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { } // Use the final state to set all the results' parameters. - for (int res = 0; res < op->getNumResults(); ++res) - if (auto type = op->getResult(res).getType().dyn_cast()) { + for (int i = 0; i < op->getNumResults(); ++i) + if (auto type = op->getResult(i).getType().dyn_cast(); + type != nullptr) { // Without this check, it will accidentally propagate the quantization // information by the shared non-float-tensors. 
if (type.getElementType().isa()) - changed |= SetResultParams(op, res, params); + changed |= SetResultParams(op, i, params); } } @@ -807,8 +822,8 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { !is_qdq_conversion_) { // Infer ranges from the activation ops. This is usually required for // the post-training quantization workflow. - // TODO(fengliuai): different result can have different fixed range. - const auto params = + // TODO: b/323478683 - Different result can have different fixed range. + const QuantizedType params = scale_spec->fixed_output_range_func(is_signed_, bit_width_); for (auto i = 0; i < op->getNumResults(); ++i) { // The range is null if the result has been quantized. @@ -818,16 +833,20 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { } } - const auto spec = GetQuantSpec(op); - for (auto& it : spec->biases_params) { - const auto params = - GetBiasParams(op, it.first, it.second.first, it.second.second); + const std::unique_ptr spec = GetQuantSpec(op); + for (const auto& [bias_operand_idx, non_bias_params] : + spec->biases_params) { + const auto& [non_bias_operand_indices, accumulator_scale_func] = + non_bias_params; + const QuantizedType params = + GetBiasParams(op, bias_operand_idx, non_bias_operand_indices, + accumulator_scale_func); if (!params) { quantized_.erase(op); continue; } - changed |= - SetBiasParamsWithAdjustments(op, it.first, it.second.first, params); + changed |= SetBiasParamsWithAdjustments(op, bias_operand_idx, + non_bias_operand_indices, params); } } @@ -836,9 +855,9 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { // Finalizes the arguments and result states in the function. void QuantizationDriver::Finalize() { - for (auto arg : args_) { - auto& state = GetArgQuantState(arg); - auto& requantizes = GetArgRequantizeStates(arg); + for (BlockArgument arg : args_) { + const QuantState& state = GetArgQuantState(arg); + RequantizeStates& requantizes = GetArgRequantizeStates(arg); if (state.IsEmpty() || (state.immutable && requantizes.empty())) { continue; } @@ -848,25 +867,24 @@ void QuantizationDriver::Finalize() { } if (!requantizes.empty()) { - RequantizeArg(arg, &requantizes); + RequantizeArg(arg, requantizes); } } - for (auto it : result_states_) { - Operation* op = it.first.first; - const int res_index = it.first.second; - auto& state = GetResultQuantState(op, res_index); - auto& requantizes = GetResultRequantizeStates(op, res_index); + for (const auto& [op_with_result_idx, quant_state_idx] : result_states_) { + const auto [op, result_idx] = op_with_result_idx; + const QuantState& state = GetResultQuantState(op, result_idx); + RequantizeStates& requantizes = GetResultRequantizeStates(op, result_idx); if (state.IsEmpty() || (state.immutable && requantizes.empty())) { continue; } if (!state.immutable) { - QuantizeOpResult(op, res_index, state.params); + QuantizeOpResult(op, result_idx, state.params); } if (!requantizes.empty()) { - RequantizeOpResult(op, res_index, &requantizes); + RequantizeOpResult(op, result_idx, requantizes); } } } @@ -885,7 +903,7 @@ void QuantizationDriver::Run() { } void ApplyQuantizationParamsPropagation( - const mlir::func::FuncOp func, const bool is_signed, const int bit_width, + const func::FuncOp func, const bool is_signed, const int bit_width, const bool disable_per_channel, const OpQuantSpecGetter op_quant_spec_getter, const bool infer_tensor_ranges, const bool legacy_float_scale, @@ -897,7 +915,7 @@ void ApplyQuantizationParamsPropagation( } void 
ApplyQuantizationParamsPropagation( - const mlir::func::FuncOp func, const bool is_signed, const int bit_width, + const func::FuncOp func, const bool is_signed, const int bit_width, const bool disable_per_channel, const OpQuantSpecGetter op_quant_spec_getter, const OpQuantScaleSpecGetter op_quant_scale_spec_getter, diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h index 59741f48307a16..d054e9ed738ce0 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h @@ -17,14 +17,13 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_DRIVER_H_ #include -#include #include #include +#include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project @@ -40,20 +39,16 @@ limitations under the License. namespace mlir { namespace quant { -static bool HasQuantParams(QuantParams p) { - return p == quant::QuantizedType(); -} - // The state for each op result during the quantization parameters propagation. struct QuantState { // Quantization parameters propagated to an op result. - QuantParams params; + QuantizedType params; // A flag indicates this state (the params) shouldn't be changed after it is // initialized. This flag will be set to true if the quantization parameters // are from the quantization-aware training. const bool immutable; - bool IsEmpty() { return HasQuantParams(params); } + bool IsEmpty() const { return params == nullptr; } }; // The state for rescaling the propagated quantization parameters. This can be @@ -70,7 +65,7 @@ struct RequantizeState { } pos = NO_REQUANTIZE; // Quantization parameters will be used to add the requantize ops. - QuantParams params; + QuantizedType params; // Avoid clobbering all uses of the value, limit to just these ops. SmallVector> users; @@ -99,15 +94,25 @@ using RequantizeStates = SmallVector; // class QuantizationDriver { public: - explicit QuantizationDriver(func::FuncOp fn, bool is_signed, int bit_width, - bool disable_per_channel, + // Type alias of int used to access `states_`. + using QuantStateIndex = int; + + // (op, operand index) pair. + using OpWithOperandIndex = std::pair; + + // (op, result index) pair. + using OpWithResultIndex = std::pair; + + explicit QuantizationDriver(func::FuncOp func_op, const bool is_signed, + const int bit_width, + const bool disable_per_channel, OpQuantSpecGetter op_quant_spec_getter, OpQuantScaleSpecGetter op_quant_scale_spec_getter, - bool infer_tensor_range, - bool legacy_float_scale = false, - bool is_qdq_conversion = false) - : fn_(fn), - builder_(fn.getBody()), + const bool infer_tensor_range, + const bool legacy_float_scale = false, + const bool is_qdq_conversion = false) + : fn_(func_op), + builder_(func_op.getBody()), is_signed_(is_signed), bit_width_(bit_width), disable_per_channel_(disable_per_channel), @@ -130,7 +135,7 @@ class QuantizationDriver { // result. 
void Finalize(); - llvm::SmallVector GetArgs() { return args_; } + SmallVector GetArgs() { return args_; } // Returns the state of the block argument. QuantState& GetArgQuantState(BlockArgument arg) { @@ -138,10 +143,6 @@ class QuantizationDriver { } private: - // This is used to identify an operand or result of an op. The second element - // of this pair is the index of the operand or result. - using OpValue = std::pair; - // Duplicates the constant op if it has multiple uses, and replaces // target_op->operand[operand_index] with the newly created op. This also // replaces corresponsing quantization states. @@ -153,13 +154,13 @@ class QuantizationDriver { // prevent overflow of quantized bias values. This also changes quantization // state of other inputs when needed. bool SetBiasParamsWithAdjustments(Operation* op, int bias_index, - const std::vector& input_indices, - QuantParams params); + ArrayRef input_indices, + QuantizedType params); // Checks preconditions to adjust bias scale. bool ShouldCheckBiasScale(Operation* op, int bias_index, - const std::vector& input_indices, - QuantParams params, int& input_index, + ArrayRef input_indices, + QuantizedType quantized_type, int& input_index, int& filter_index); // Preprocesses the constants by doing the following: @@ -187,84 +188,92 @@ class QuantizationDriver { bool IsQuantized(Operation* op); // Adds all the users of index-th result of op to the work list. - void AddUserToList(Operation* op, int index) { + void AddUserToList(Operation* op, const int index) { for (Operation* user : op->getResult(index).getUsers()) { work_list_.push_back(user); } } // Adds the defining op of index-th operand of op to the work list. - void AddOperandToList(Operation* op, int index) { - if (Operation* inst = op->getOperand(index).getDefiningOp()) { - work_list_.push_back(inst); + void AddOperandToList(Operation* op, const int index) { + if (Operation* operand_op = op->getOperand(index).getDefiningOp(); + operand_op != nullptr) { + work_list_.push_back(operand_op); } } // Returns the quantization params for the bias input from the non-bias // operands which have their indexes in the `non_biases` vector. The returned // parameters are calculated by `func`. - QuantParams GetBiasParams(Operation* op, int bias_index, - const std::vector& non_biases, - AccumulatorScaleFunc func); - - // Sets the quantization parameters of the result to a fixed value. If any - // quantization parameters have been propagated, a `requantize` will happen on - // the input of propagated quantization. - bool SetResultParams(Operation* op, int index, QuantParams params); - - // Sets the quantization parameters of the operand to a fixed value. If any + QuantizedType GetBiasParams(Operation* op, int bias_index, + ArrayRef non_bias_operand_indices, + AccumulatorScaleFunc func); + + // Sets the quantization parameters of the result to `quantized_type`. If + // any quantization parameters have been propagated, a requantize will + // happen on the input of propagated quantization. Returns `true` if internal + // state has been modified. + bool SetResultParams(Operation* op, int result_index, + QuantizedType quantized_type); + + // Sets the quantization parameters of the operand to `quantized_type`. If any // quantization parameters have been propagated, a `requantize` will happen on // the output of propagated quantization. When `override` is set, quantization - // state of the value is replaced instead of adding requantization. 
- bool SetOperandParams(Operation* op, int index, QuantParams params, - bool override = false); + // state of the value is replaced instead of adding requantization. Returns + // `true` if internal state has been modified. + bool SetOperandParams(Operation* op, int operand_index, + QuantizedType quantized_type, bool override = false); // Sets the quantization parameters of the constant result according to its // content. bool SetConstantResultParams(Operation* op); - // Inserts the Quantize and Dequantize ops for quantizing the index-th result - // of the op. - void QuantizeOpResult(Operation* op, int index, QuantParams params); + // Inserts the Quantize and Dequantize ops after `op`'s `index`-th result. The + // quantized element type for the result is `quantized_type`. + void QuantizeOpResult(Operation* op, int result_index, + QuantizedType quantized_type); - void QuantizeArg(BlockArgument arg, QuantParams params); + // Inserts the Quantize and Dequantize ops after `arg`. The quantized element + // type for `arg` is `quantized_type`. + void QuantizeArg(BlockArgument arg, QuantizedType quantized_type); - // Inserts the Quantize and Dequantize ops to quantize the value and returns - // the Quantize op. - void QuantizeValue(Value value, QuantParams params, Location loc); + // Inserts the Quantize and Dequantize ops (i.e. QDQ) after `value`. The + // quantized element type for `value` is `quantized_type`. + void QuantizeValue(Value value, QuantizedType quantized_type, Location loc); // Inserts the Quantize ops for requantizing the index-th result of the op. - void RequantizeOpResult(Operation* op, int index, RequantizeStates* states); + void RequantizeOpResult(Operation* op, int result_index, + RequantizeStates& states); // Inserts the Quantize ops for requantizing a block argument. - void RequantizeArg(BlockArgument arg, RequantizeStates* states); + void RequantizeArg(BlockArgument arg, RequantizeStates& states); // Inserts the Quantize and Dequantize ops to quantize the value and returns // the Quantize op. - void RequantizeValue(Value value, RequantizeStates* states, Location loc); + void RequantizeValue(Value value, RequantizeStates& states, Location loc); // Returns the quantization parameter satisfies the same scale // constraints for the op. Returns an empty option if this quantization // parameter doesn't exist. - QuantParams GetQuantParamsForSameScaleConstraint(Operation* op); + QuantizedType GetQuantParamsForSameScaleConstraint(Operation* op); // Returns the state of the index-th operand of the op. - QuantState& GetOperandQuantState(Operation* op, int index) { + QuantState& GetOperandQuantState(Operation* op, const int index) { return states_[operand_states_[{op, index}]]; } // Returns the state of the index-th result of the op. - QuantState& GetResultQuantState(Operation* op, int index) { + QuantState& GetResultQuantState(Operation* op, const int index) { return states_[result_states_[{op, index}]]; } // Returns the states of the index-th operand of the op. - RequantizeStates& GetOperandRequantizeStates(Operation* op, int index) { + RequantizeStates& GetOperandRequantizeStates(Operation* op, const int index) { return rescale_states_[operand_states_[{op, index}]]; } // Returns the states of the index-th result of the op. 
- RequantizeStates& GetResultRequantizeStates(Operation* op, int index) { + RequantizeStates& GetResultRequantizeStates(Operation* op, const int index) { return rescale_states_[result_states_[{op, index}]]; } @@ -278,10 +287,6 @@ class QuantizationDriver { // a new entry in the state vector. void InitializeArgState(BlockArgument arg, Value arg_value); - // Sets the state of index-th operand / result of op. - void InitializeStateForValue(Operation* op, int index, Value value, - bool as_result); - // Sets the state of the index-th operand of the op. If this operand is // cached, uses the cached result without creating new entry in the state // vector. Otherwise, allocate a new entry in the state vector. @@ -301,12 +306,13 @@ class QuantizationDriver { // We should distinguish weights and bias constants. Biases are specified by // the quantization spec or are the operands of ops with same scale spec. The // rest are weights. - llvm::DenseSet weights_; + DenseSet weights_; // The weights require narrow_range quantization. This map collects all the - // weight operands defined by the op quant spec. If the value of the entry is - // positive, per-channel quantization is required. - llvm::DenseMap optimized_weights_; + // weight operands defined by the op quant spec. The value of each entry is + // the quantization dimension. If it is positive, per-channel quantization is + // required. + DenseMap optimized_weights_; // All the ops needs to propagate the quantization parameters to. std::vector work_list_; @@ -319,18 +325,18 @@ class QuantizationDriver { // The map contains all the quantization parameters which are required to // satisfy the same operands and results constraint. The keys of this map are // the values from `operand_states_` and `result_state_`. - std::unordered_map rescale_states_; + absl::flat_hash_map rescale_states_; // Maps of indexes to the propagation state vector from the ops operands, // results and arguments. - llvm::DenseMap operand_states_; - llvm::DenseMap result_states_; - llvm::DenseMap arg_states_; - llvm::DenseMap value_to_state_; + DenseMap operand_states_; + DenseMap result_states_; + DenseMap arg_states_; + DenseMap value_to_state_; // This vector is to preserve the arguments order, so the newly inserted // quantized ops for the arguments are deterministically ordered. - llvm::SmallVector args_; + SmallVector args_; OpQuantSpecGetter op_quant_spec_getter_; OpQuantScaleSpecGetter op_quant_scale_spec_getter_; @@ -357,7 +363,7 @@ class QuantizationDriver { // Setting `infer_tensor_range` to true, to infer quantization parameters from // the activation ops and weight constants. This is only used for post-training // quantization. 
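// A rough usage sketch of the eight-argument overload below; `GetOpQuantSpec`
// stands in for a caller-provided OpQuantSpecGetter and is not defined here:
//
//   ApplyQuantizationParamsPropagation(
//       func, /*is_signed=*/true, /*bit_width=*/8,
//       /*disable_per_channel=*/false, GetOpQuantSpec,
//       /*infer_tensor_ranges=*/true, /*legacy_float_scale=*/false,
//       /*is_qdq_conversion=*/false);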
-void ApplyQuantizationParamsPropagation(mlir::func::FuncOp func, bool is_signed, +void ApplyQuantizationParamsPropagation(func::FuncOp func, bool is_signed, int bit_width, bool disable_per_channel, OpQuantSpecGetter op_quant_spec_getter, bool infer_tensor_ranges, @@ -365,8 +371,8 @@ void ApplyQuantizationParamsPropagation(mlir::func::FuncOp func, bool is_signed, bool is_qdq_conversion); void ApplyQuantizationParamsPropagation( - mlir::func::FuncOp func, bool is_signed, int bit_width, - bool disable_per_channel, OpQuantSpecGetter op_quant_spec_getter, + func::FuncOp func, bool is_signed, int bit_width, bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, OpQuantScaleSpecGetter op_quant_scale_spec_getter, bool infer_tensor_ranges, bool legacy_float_scale, bool is_qdq_conversion); diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h index d95ba49cf8e800..88017117098aca 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h @@ -26,10 +26,10 @@ limitations under the License. #include #include #include -#include #include #include +#include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/strings/string_view.h" #include "llvm/ADT/DenseMap.h" @@ -86,11 +86,11 @@ inline constexpr double kNearZeroTolerance = 1.0e-6; using QuantParams = QuantizedType; using QuantSpec = QuantizationSpecs; using SignedInteger = std::pair; // bitwidth and sign -using QuantParamsForResults = llvm::SmallVector; +using QuantParamsForResults = llvm::SmallVector; using AccumulatorScaleFunc = - std::function&, int, bool)>; + std::function&, int, bool)>; using BiasParamsMap = - std::unordered_map, AccumulatorScaleFunc>>; + absl::flat_hash_map, AccumulatorScaleFunc>>; // UniformQuantizedType GetFixedOutputRange(bool sign, int bit_width) using GetFixedOutputRangeFunc = std::function; // bool RequiredSameOperandsAndResultsScale(bool sign, int $bit_width) From 6c62a390ba07f071e52f4c727a62446388e26044 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 Mar 2024 15:55:19 -0700 Subject: [PATCH 047/670] 1. Fix flaky test after recently enabling the modelling of resharding memory costs by default in auto-sharding. 2. Also check shapes of parameters in the test instead of sharding annotations as sharding annotations may not be preserved through the compilation. 
PiperOrigin-RevId: 616971712 --- third_party/xla/xla/service/gpu/BUILD | 1 + .../gpu/auto_sharding_gpu_compiler_test.cc | 37 ++++++++++++++++--- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD index b94c1f10f9ce84..dd3fc565d28481 100644 --- a/third_party/xla/xla/service/gpu/BUILD +++ b/third_party/xla/xla/service/gpu/BUILD @@ -3929,6 +3929,7 @@ xla_cc_test( srcs = ["auto_sharding_gpu_compiler_test.cc"], tags = tf_cuda_tests_tags() + ["no_oss"], # TODO(b/277355322): Make autosharding work in OSS deps = [ + "//xla:shape_util", "//xla/hlo/ir:hlo", "//xla/service:gpu_plugin", "//xla/service:hlo_module_config", diff --git a/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc index 06928aa44a08b1..eab4b0d48e5dbb 100644 --- a/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc +++ b/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc @@ -17,10 +17,12 @@ limitations under the License. #include #include +#include "xla/hlo/ir/hlo_instruction.h" #include "xla/hlo/ir/hlo_module.h" #include "xla/service/hlo_module_config.h" #include "xla/service/pattern_matcher.h" #include "xla/service/pattern_matcher_gmock.h" +#include "xla/shape_util.h" #include "xla/tests/hlo_test_base.h" #include "tsl/platform/logging.h" @@ -30,6 +32,8 @@ namespace { namespace m = ::xla::match; +using ::testing::Conditional; + class AutoShardingTest : public HloTestBase { protected: const char* const dot_hlo_string_ = R"( @@ -60,14 +64,35 @@ ENTRY matmul { }; TEST_F(AutoShardingTest, MatMulWithAutosharding) { - auto compiled_module = CompileMatMul(true, 4); - auto* instruction = + std::unique_ptr compiled_module = CompileMatMul(true, 4); + const HloInstruction* parameter1 = compiled_module->entry_computation()->parameter_instruction(0); - VLOG(2) << instruction->ToString(); + const HloInstruction* parameter2 = + compiled_module->entry_computation()->parameter_instruction(1); + bool is_parameter1_replicated = ShapeUtil::Equal( + parameter1->shape(), ShapeUtil::MakeShape(PrimitiveType::F32, {32, 64})); + bool is_parameter2_replicated = ShapeUtil::Equal( + parameter2->shape(), ShapeUtil::MakeShape(PrimitiveType::F32, {64, 128})); + + // Check that at least one of the parameters is sharded, thereby telling us + // that the dot is as well. + VLOG(2) << parameter1->ToString(); + EXPECT_THAT( + parameter1, + Conditional( + is_parameter2_replicated, + AnyOf(GmockMatch(m::Op().WithShape(PrimitiveType::F32, {8, 64})), + GmockMatch(m::Op().WithShape(PrimitiveType::F32, {32, 16}))), + GmockMatch(m::Op().WithShape(PrimitiveType::F32, {32, 64})))); + + VLOG(2) << parameter2->ToString(); EXPECT_THAT( - instruction, - AnyOf(GmockMatch(m::Op().WithSharding("{devices=[1,4]0,1,2,3}")), - GmockMatch(m::Op().WithSharding("{devices=[4,1]0,1,2,3}")))); + parameter2, + Conditional( + is_parameter1_replicated, + AnyOf(GmockMatch(m::Op().WithShape(PrimitiveType::F32, {16, 128})), + GmockMatch(m::Op().WithShape(PrimitiveType::F32, {64, 32}))), + GmockMatch(m::Op().WithShape(PrimitiveType::F32, {64, 128})))); } TEST_F(AutoShardingTest, MatMulWithoutAutosharding) { From 400c2b04ab3360d9bfdc5f9eada6a45cd827e8f0 Mon Sep 17 00:00:00 2001 From: Dan Suh Date: Mon, 18 Mar 2024 15:56:28 -0700 Subject: [PATCH 048/670] Rename `fold_constant_transpose_pass.cc`->`fold_constant_transpose.cc`. Conventionally the `_pass` prefix isn't used. 
PiperOrigin-RevId: 616971947 --- tensorflow/compiler/mlir/quantization/stablehlo/BUILD | 2 +- ...ld_constant_transpose_pass.cc => fold_constant_transpose.cc} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tensorflow/compiler/mlir/quantization/stablehlo/passes/{fold_constant_transpose_pass.cc => fold_constant_transpose.cc} (100%) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD index 99c93739949a9c..11b100be5601c1 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD @@ -48,7 +48,7 @@ cc_library( srcs = [ "passes/convert_func_to_bfloat16.cc", "passes/convert_xla_call_module_op_to_bfloat16.cc", - "passes/fold_constant_transpose_pass.cc", + "passes/fold_constant_transpose.cc", "passes/lift_quantizable_spots_as_functions.cc", "passes/lift_quantizable_spots_as_functions_fusion.inc", "passes/lift_quantizable_spots_as_functions_simple.inc", diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose_pass.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc similarity index 100% rename from tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose_pass.cc rename to tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc From 28c0ea47129c47ee10dcf7ee3a47e8a25a2eca42 Mon Sep 17 00:00:00 2001 From: Juhyun Lee Date: Mon, 18 Mar 2024 16:01:41 -0700 Subject: [PATCH 049/670] Add tensor shape check for ADD & MUL. PiperOrigin-RevId: 616973218 --- .../delegates/gpu/common/model_builder.cc | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 548cbcba1afc80..10e3efd3bbbb5a 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -1143,19 +1143,32 @@ class ElementwiseOperationParser : public TFLiteOperationParser { int input_tensor1 = 1; if (operation_type_ == OperationType::MUL || operation_type_ == OperationType::ADD) { - // The "larger" input tensor must be bound to 1st input and the - // "smaller" input tensor must be bound to 2nd input. + // The "larger" input tensor MUST be the 1st argument, and the + // "smaller" input tensor must be the 2nd. BHWC shape0; RETURN_IF_ERROR(ExtractTensorShape(*input0, &shape0)); BHWC shape1; RETURN_IF_ERROR(ExtractTensorShape(*input1, &shape1)); + if (shape0.b != shape1.b) { + return absl::InvalidArgumentError(absl::StrCat( + "Tensor shape (b) mismatch: ", shape0.b, " vs ", shape1.b)); + } else if (shape0.c != shape1.c) { + return absl::InvalidArgumentError(absl::StrCat( + "Tensor shape (c) mismatch: ", shape0.c, " vs ", shape1.c)); + } else if (!(shape0.h <= shape1.h && shape0.w <= shape1.w) && + !(shape0.h >= shape1.h && shape0.w >= shape1.w)) { + // One input tensor must be consistently larger (or smaller) than or + // as same shaped as the other input tensor in both dimensions. 
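+          // For example, HW shapes (8, 8) vs (1, 1) and (8, 8) vs (8, 8) are
+          // accepted, while (8, 4) vs (4, 8) is rejected: neither tensor is
+          // at least as large as the other in both dimensions.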
+ return absl::InvalidArgumentError(absl::StrCat( + "Tensor shape (h, w) mismatch: (", shape0.h, ", ", shape0.w, + ") vs (", shape1.h, ", ", shape1.w, ")")); + } if (shape0.h <= shape1.h && shape0.w <= shape1.w && shape0.c == shape1.c) { input_tensor0 = 1; input_tensor1 = 0; } } - RETURN_IF_ERROR(reader->AddInput(node, input_tensor0)); RETURN_IF_ERROR(reader->AddInput(node, input_tensor1)); } From 1b85215e88f7254b50ed0552d41195d6ebd6e122 Mon Sep 17 00:00:00 2001 From: Dan Suh Date: Mon, 18 Mar 2024 16:33:18 -0700 Subject: [PATCH 050/670] Cleanup: `lift_as_function_call.h/cc`. * Remove unnecessary `llvm::` and `mlir::`. * Use `ArrayRef`s where applicable. * Use pass-by-reference where applicable (e.g. OpBuilder). * Use pass-by-value where applicable (e.g. Value). PiperOrigin-RevId: 616981588 --- .../common/lift_as_function_call.cc | 118 +++++++++--------- .../common/lift_as_function_call.h | 27 ++-- .../common/lift_as_function_call_test.cc | 29 ++--- 3 files changed, 89 insertions(+), 85 deletions(-) diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc index 9c700ed50bc4d0..86ba98a7ee1139 100644 --- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc @@ -69,12 +69,10 @@ constexpr int64_t kDefaultVersion = 9; constexpr StringRef kPlatformCpu = "CPU"; // Name of `tf.XlaCallModule`'s dictionary attribute for keeping the // deserialized stablehlo module's attributes. -constexpr llvm::StringRef kStablehloModuleAttrsAttrName = - "_stablehlo_module_attrs"; +constexpr StringRef kStablehloModuleAttrsAttrName = "_stablehlo_module_attrs"; // Attribute required for running shape refinement pass enabled in XlaCallModule // version 8 and above. -constexpr llvm::StringRef kUsesShapePolymorphismAttr = - "jax.uses_shape_polymorphism"; +constexpr StringRef kUsesShapePolymorphismAttr = "jax.uses_shape_polymorphism"; // Checks if the op is inside a lifted function. bool IsInLiftedFunc(Operation& op) { @@ -83,16 +81,16 @@ bool IsInLiftedFunc(Operation& op) { // Inserts the function to the symbol table of the module thread-safely. StringAttr InsertToSymbolTable(Operation& module, Operation& function, - const std::string& func_name) { + const StringRef func_name) { static tensorflow::mutex* mtx = new tensorflow::mutex(); tensorflow::mutex_lock lock(*mtx); SymbolTable symbol_table(&module); - std::string unique_name = func_name; + std::string unique_name = func_name.str(); int32_t uniquing_counter = 0; while (symbol_table.lookup(unique_name) != nullptr) { ++uniquing_counter; - unique_name = func_name + "_" + std::to_string(uniquing_counter); + unique_name = absl::StrCat(func_name.str(), "_", uniquing_counter); } function.setAttr("sym_name", StringAttr::get(module.getContext(), unique_name)); @@ -101,9 +99,11 @@ StringAttr InsertToSymbolTable(Operation& module, Operation& function, // Creates the TF::PartitionedCallOp with the given arguments and output types. // This function call op is for invoking the TF subgraphs. 
-ValueRange createTFPartitionedCallOp(OpBuilder builder, Location location, - StringRef func_name, - TypeRange output_types, ValueRange args) { +ValueRange CreateTFPartitionedCallOp(OpBuilder& builder, + const Location location, + const StringRef func_name, + const TypeRange output_types, + const ValueRange args) { TF::PartitionedCallOp call_op = builder.create( location, output_types, args, FlatSymbolRefAttr::get(builder.getStringAttr(func_name)), @@ -112,7 +112,7 @@ ValueRange createTFPartitionedCallOp(OpBuilder builder, Location location, // Set the attribute to annotate this function call op as a quantizable spot. call_op->setAttr( kQuantTraitAttrName, - builder.getStringAttr(llvm::StringRef( + builder.getStringAttr(StringRef( std::string(QuantTraitValues[QuantizationTrait::FullyQuantizable])))); return call_op.getOutput(); @@ -120,10 +120,11 @@ ValueRange createTFPartitionedCallOp(OpBuilder builder, Location location, // Creates the TF::XlaCallModuleOp with the given arguments and output types. // This function call op is for invoking the StableHLO subgraphs. -ValueRange createTFXlaCallModuleOp(OpBuilder builder, Location location, - StringRef func_name, TypeRange output_types, - ValueRange args) { - auto ctx = builder.getContext(); +ValueRange CreateTFXlaCallModuleOp(OpBuilder& builder, const Location location, + const StringRef func_name, + const TypeRange output_types, + const ValueRange args) { + MLIRContext* ctx = builder.getContext(); // Collect the shapes of the output to fill up the Sout attribute. SmallVector shape_attrs; for (const Type result_type : output_types) { @@ -133,7 +134,7 @@ ValueRange createTFXlaCallModuleOp(OpBuilder builder, Location location, auto empty_array_attr = ArrayAttr::get(ctx, {}); auto platforms = ArrayAttr::get(ctx, {StringAttr::get(ctx, kPlatformCpu)}); - TF::XlaCallModuleOp call_op = builder.create( + auto call_op = builder.create( location, /*output=*/output_types, /*args=*/args, @@ -159,7 +160,7 @@ ValueRange createTFXlaCallModuleOp(OpBuilder builder, Location location, // Set the attribute to annotate this function call op as a quantizable spot. call_op->setAttr( kQuantTraitAttrName, - builder.getStringAttr(llvm::StringRef( + builder.getStringAttr(StringRef( std::string(QuantTraitValues[QuantizationTrait::FullyQuantizable])))); // Set jax.uses_shape_polymorphism=true to enable shape refinement at runtime. @@ -172,27 +173,25 @@ ValueRange createTFXlaCallModuleOp(OpBuilder builder, Location location, } // Creates the function call op based on the given call_op_type argument. -ValueRange createFunctionCallOp(OpBuilder builder, Location location, - FunctionCallOpType call_op_type, - StringRef func_name, TypeRange output_types, - ValueRange args) { +ValueRange CreateFunctionCallOp(OpBuilder& builder, const Location location, + const FunctionCallOpType call_op_type, + const StringRef func_name, + const TypeRange output_types, + const ValueRange args) { switch (call_op_type) { case FunctionCallOpType::TFXlaCallModuleOp: - return createTFXlaCallModuleOp(builder, location, func_name, output_types, + return CreateTFXlaCallModuleOp(builder, location, func_name, output_types, args); case FunctionCallOpType::TFPartitionedCallOp: - return createTFPartitionedCallOp(builder, location, func_name, + return CreateTFPartitionedCallOp(builder, location, func_name, output_types, args); - default: - llvm_unreachable("unhandled call op type"); } } // Finds ops in the paths from arguments to results. 
The ops is listed in an // order that the former ops shouldn't have any dependencies on the later ones. -llvm::SmallVector FindOpsFromArgumentsToResults( - const llvm::SmallVector& arguments, - const llvm::SmallVector& results) { +SmallVector FindOpsFromArgumentsToResults( + const ArrayRef arguments, const ArrayRef results) { std::queue value_queue; for (Value result : results) { value_queue.push(result); @@ -213,7 +212,7 @@ llvm::SmallVector FindOpsFromArgumentsToResults( Operation* defining_node = current_value.getDefiningOp(); if (defining_node == nullptr) continue; op_stack.push(defining_node); - for (const auto& arg : defining_node->getOperands()) { + for (Value arg : defining_node->getOperands()) { if (!argument_set.contains(arg.getImpl())) { value_queue.push(arg); } @@ -221,7 +220,7 @@ llvm::SmallVector FindOpsFromArgumentsToResults( } // Remove duplicate ops from the op stack. - llvm::SmallVector sorted_ops; + SmallVector sorted_ops; absl::flat_hash_set unique_ops; while (!op_stack.empty()) { Operation* current_op = op_stack.top(); @@ -243,9 +242,9 @@ llvm::SmallVector FindOpsFromArgumentsToResults( // "0:transpose_a,1:transpose_b", where 0 and 1 are the respective attribute // identifiers. // This function returns success if all attributes could be found. -LogicalResult SetAttributeMap( - MLIRContext& context, const llvm::SmallVector& attributes, - const llvm::SmallVector& ops) { +LogicalResult SetAttributeMap(MLIRContext& context, + const ArrayRef attributes, + const ArrayRef ops) { // A map to find which operation an attribute belongs to. // The key for this map uses the entire NamedAttribute object, i.e. the // {attribute_name, attribute_value} pair. @@ -270,8 +269,8 @@ LogicalResult SetAttributeMap( attr_to_op_map.begin(), attr_to_op_map.end(), [&](auto attr_op) { return std::get<0>(attr_op).getName() == attribute.getName(); }) == attr_to_op_map.end()) { - mlir::emitError(UnknownLoc::get(&context), - "Could not find attribute: " + attribute.getName().str()); + emitError(UnknownLoc::get(&context), + "Could not find attribute: " + attribute.getName().str()); return failure(); } @@ -293,7 +292,7 @@ LogicalResult SetAttributeMap( // Append ":". Ex) "0:transpose_a". const std::string identifier = std::to_string(idx); - const mlir::StringAttr attribute_name = attribute.getName(); + const StringAttr attribute_name = attribute.getName(); absl::StrAppend(&new_attr_map_str, identifier, ":", attribute_name.str()); owner_op->setAttr(kAttrMapAttribute, StringAttr::get(&context, new_attr_map_str)); @@ -303,14 +302,14 @@ LogicalResult SetAttributeMap( } // Creates a function to wrap the section between arguments and results. 
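// Roughly: the ops found between `arguments` and `results` are cloned into a
// new private func.func named `func_name` (uniqued in the module's symbol
// table), and the results of a call op built by CreateFunctionCallOp (a
// TF::XlaCallModuleOp or TF::PartitionedCallOp, depending on `call_op_type`)
// are returned so the caller can substitute them for the original values.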
-llvm::SmallVector LiftAsFunctionCall( - OpBuilder builder, Location location, FunctionCallOpType call_op_type, - StringRef func_name, const llvm::SmallVector& arguments, - const llvm::SmallVector& results, - const llvm::SmallVector& attributes) { +SmallVector LiftAsFunctionCall( + OpBuilder& builder, const Location location, + const FunctionCallOpType call_op_type, const StringRef func_name, + const ArrayRef arguments, const ArrayRef results, + const ArrayRef attributes) { MLIRContext* context = builder.getContext(); if (results.empty()) { - mlir::emitError(UnknownLoc::get(context), "No result values specified"); + emitError(UnknownLoc::get(context), "No result values specified"); return {}; } Operation* result_op = results[0].getDefiningOp(); @@ -324,10 +323,11 @@ llvm::SmallVector LiftAsFunctionCall( TypeRange result_types{ValueRange{results}}; auto func_type = FunctionType::get(context, arg_types, result_types); - llvm::SmallVector arg_locs; - for (const auto& arg : arguments) { + SmallVector arg_locs; + for (Value arg : arguments) { arg_locs.push_back(arg.getLoc()); } + auto wrap_func = builder.create(location, func_name, func_type); wrap_func.setVisibility(SymbolTable::Visibility::Private); // The callee function for TF::XlaCallModuleOp must have this attribute. @@ -361,34 +361,36 @@ llvm::SmallVector LiftAsFunctionCall( builder.clone(*op, mapping); } - llvm::SmallVector return_values; + SmallVector return_values; for (Value result : results) { return_values.push_back(mapping.lookupOrNull(result)); } - builder.create(location, return_values); + builder.create(location, return_values); // Create a function call to the newly created function. StringAttr new_func_name = - InsertToSymbolTable(*module, *wrap_func, func_name.str()); + InsertToSymbolTable(*module, *wrap_func, func_name); builder.setInsertionPointAfter(result_op); ValueRange new_results = - createFunctionCallOp(builder, call_op_loc, call_op_type, + CreateFunctionCallOp(builder, call_op_loc, call_op_type, new_func_name.getValue(), result_types, arguments); - return llvm::SmallVector(new_results.begin(), new_results.end()); + return SmallVector(new_results.begin(), new_results.end()); } -llvm::SmallVector LiftAsFunctionCall( - OpBuilder builder, Location location, FunctionCallOpType call_op_type, - StringRef func_name, const llvm::SmallVector& arguments, - const llvm::SmallVector& results) { - llvm::SmallVector attributes; +SmallVector LiftAsFunctionCall(OpBuilder& builder, + const Location location, + const FunctionCallOpType call_op_type, + const StringRef func_name, + const ArrayRef arguments, + const ArrayRef results) { + SmallVector attributes; return LiftAsFunctionCall(builder, location, call_op_type, func_name, arguments, results, attributes); } -llvm::SmallVector AppendToVector( - const llvm::SmallVector& arguments, Value append) { - llvm::SmallVector ret(arguments); +SmallVector AppendToVector(const ArrayRef arguments, + Value append) { + SmallVector ret(arguments); ret.push_back(append); return ret; } @@ -402,7 +404,7 @@ llvm::SmallVector AppendToVector( // could process the following equation by setting the attributes properly: // abc,cd->abd. // 4. 
The output should be in the form: [batch dims][lhs dims][rhs dims] -bool IsEinsumSupportedByXlaDotV2(mlir::StringAttr equation_attr) { +bool IsEinsumSupportedByXlaDotV2(StringAttr equation_attr) { StringRef equation = equation_attr.getValue(); if (!absl::StrContains(equation, "->") || !absl::StrContains(equation, ",") || diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h index f2edd732f50cc5..db86b56734ab99 100644 --- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h @@ -48,10 +48,10 @@ inline constexpr StringRef kQuantizationMethodAttr = "_quantization_method"; enum FunctionCallOpType { TFPartitionedCallOp = 0, TFXlaCallModuleOp = 1 }; // Checks if the op is inside a lifted function. -bool IsInLiftedFunc(Operation &op); +bool IsInLiftedFunc(Operation& op); // Checks if the given einsum op is supported for XlaDotV2 quantization. -bool IsEinsumSupportedByXlaDotV2(mlir::StringAttr equation_attr); +bool IsEinsumSupportedByXlaDotV2(StringAttr equation_attr); // Gets the quantization method from the given `XlaCallModuleOp`. It is // retrieved from the `kQuantizationMethodAttr` string attribute. Returns @@ -64,23 +64,24 @@ absl::StatusOr<::stablehlo::quantization::Method> GetQuantizationMethod( // The generated function call op type will be decided by the given call_op_type // argument. Currently, it supports TF::XlaCallModuleOp and // TF::PartitionedCallOp function call op generations. -llvm::SmallVector LiftAsFunctionCall( - OpBuilder builder, Location location, FunctionCallOpType call_op_type, - StringRef func_name, const llvm::SmallVector &arguments, - const llvm::SmallVector &results, - const llvm::SmallVector &attributes); +SmallVector LiftAsFunctionCall(OpBuilder& builder, Location location, + FunctionCallOpType call_op_type, + StringRef func_name, + ArrayRef arguments, + ArrayRef results, + ArrayRef attributes); // Same as above but with empty attributes. -llvm::SmallVector LiftAsFunctionCall( - OpBuilder builder, Location location, FunctionCallOpType call_op_type, - StringRef func_name, const llvm::SmallVector &arguments, - const llvm::SmallVector &results); +SmallVector LiftAsFunctionCall(OpBuilder& builder, Location location, + FunctionCallOpType call_op_type, + StringRef func_name, + ArrayRef arguments, + ArrayRef results); // Add the second argument to the first argument, which is expected to be an // argument list. // Used to attach bias to einsum argument list. 
-llvm::SmallVector AppendToVector( - const llvm::SmallVector &arguments, Value append); +SmallVector AppendToVector(ArrayRef arguments, Value append); } // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc index 3d1285928f5f18..30c1a342f8d4d5 100644 --- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc @@ -46,7 +46,7 @@ using ::testing::NotNull; using ::tsl::testing::IsOk; using ::tsl::testing::StatusIs; -using LiftAsFunctionCallTest = ::mlir::quant::QuantizationTestBase; +using LiftAsFunctionCallTest = QuantizationTestBase; constexpr absl::string_view kModuleLifted = R"mlir( module { @@ -65,9 +65,8 @@ TEST_F(LiftAsFunctionCallTest, LiftedFunctionSucceeds) { module_op->lookupSymbol("composite_dot_general_fn_1"); ASSERT_THAT(composite_dot_general_fn, NotNull()); - Operation* dot_general_op = - FindOperationOfType( - composite_dot_general_fn); + auto dot_general_op = FindOperationOfType( + composite_dot_general_fn); EXPECT_TRUE(IsInLiftedFunc(*dot_general_op)); } @@ -87,7 +86,7 @@ TEST_F(LiftAsFunctionCallTest, FunctionLiftedAsXlaCallModuleOp) { func::FuncOp main_fn = FindMainFuncOp(*module_op); ASSERT_THAT(main_fn, NotNull()); - Operation* dot_general_op = + auto dot_general_op = FindOperationOfType(main_fn); const SmallVector& attributes = { @@ -97,19 +96,20 @@ TEST_F(LiftAsFunctionCallTest, FunctionLiftedAsXlaCallModuleOp) { 1, mlir::stablehlo::PrecisionAttr::get( ctx_.get(), mlir::stablehlo::Precision::DEFAULT)))), }; + const SmallVector operands(dot_general_op->getOperands()); + const SmallVector results(dot_general_op->getResults()); Operation* lifted_op = LiftAsFunctionCall(builder_, dot_general_op->getLoc(), FunctionCallOpType::TFXlaCallModuleOp, - "composite_dot_general_fn", - dot_general_op->getOperands(), - dot_general_op->getResults(), attributes)[0] + "composite_dot_general_fn", operands, results, + attributes)[0] .getDefiningOp(); const auto entry_function_symbol_ref = lifted_op->getAttrOfType("_entry_function"); SymbolTable symbol_table(*module_op); auto entry_func = dyn_cast_or_null( symbol_table.lookup(entry_function_symbol_ref.getValue())); - Operation* lifted_dot_general_op = + auto lifted_dot_general_op = FindOperationOfType(entry_func); EXPECT_TRUE(isa(lifted_op)); @@ -129,13 +129,14 @@ TEST_F(LiftAsFunctionCallTest, FunctionNoAttrLiftedAsXlaCallModuleOp) { func::FuncOp main_fn = FindMainFuncOp(*module_op); ASSERT_THAT(main_fn, NotNull()); - Operation* dot_general_op = + auto dot_general_op = FindOperationOfType(main_fn); + const SmallVector operands(dot_general_op->getOperands()); + const SmallVector results(dot_general_op->getResults()); Operation* lifted_op = - LiftAsFunctionCall( - builder_, dot_general_op->getLoc(), - FunctionCallOpType::TFXlaCallModuleOp, "composite_dot_general_fn", - dot_general_op->getOperands(), dot_general_op->getResults())[0] + LiftAsFunctionCall(builder_, dot_general_op->getLoc(), + FunctionCallOpType::TFXlaCallModuleOp, + "composite_dot_general_fn", operands, results)[0] .getDefiningOp(); EXPECT_TRUE(isa(lifted_op)); EXPECT_EQ(lifted_op->getAttr("_original_entry_function").cast(), From 4452743c353854631826fd1793753e0a41181926 Mon Sep 17 00:00:00 2001 From: Son Tuan Vu Date: Mon, 18 Mar 2024 16:33:26 -0700 Subject: [PATCH 051/670] [xla:gpu] Dynamic offsets must be read one by one When creating address 
computation thunk, we need to load dynamic offsets from device to host one by one, as dynamic-slice and DUS ops have each offset defined by a separate runtime value. PiperOrigin-RevId: 616981622 --- .../gpu/runtime/address_computation_thunk.cc | 86 +++--- .../gpu/runtime/address_computation_thunk.h | 8 +- .../runtime/address_computation_thunk_test.cc | 246 +++++++++++++----- 3 files changed, 235 insertions(+), 105 deletions(-) diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc index 8affba065d2d78..28cf9163774ca5 100644 --- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc +++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc @@ -46,11 +46,11 @@ AddressComputationThunk::AddressComputationThunk( ThunkInfo thunk_info, std::unique_ptr embedded_thunk, std::vector> operands, std::vector> results, - std::vector> + std::vector>> operand_offset_buffer_indices, std::vector> operand_orig_shapes, std::vector> operand_sliced_shapes, - std::vector> + std::vector>> result_offset_buffer_indices, std::vector> result_orig_shapes, std::vector> result_sliced_shapes) @@ -79,6 +79,10 @@ absl::Status AddressComputationThunk::Prepare( TF_RET_CHECK(operand_sliced_shapes_[i]->IsArray()); TF_RET_CHECK(operand_orig_shapes_[i].has_value() && operand_orig_shapes_[i]->IsArray()); + TF_RET_CHECK(operand_sliced_shapes_[i]->rank() == + operand_orig_shapes_[i]->rank()); + TF_RET_CHECK(operand_offset_buffer_indices_[i]->size() == + operand_orig_shapes_[i]->rank()); } } @@ -93,6 +97,10 @@ absl::Status AddressComputationThunk::Prepare( TF_RET_CHECK(result_sliced_shapes_[i]->IsArray()); TF_RET_CHECK(result_orig_shapes_[i].has_value() && result_orig_shapes_[i]->IsArray()); + TF_RET_CHECK(result_sliced_shapes_[i]->rank() == + result_orig_shapes_[i]->rank()); + TF_RET_CHECK(result_offset_buffer_indices_[i]->size() == + result_orig_shapes_[i]->rank()); } } @@ -167,32 +175,37 @@ absl::Status AddressComputationThunk::ExecuteOnStream( continue; } - se::DeviceMemoryBase offset_src = - orig_allocations.GetDeviceAddress(*operand_offset_buffer_indices_[i]); - - // Copy the ith offset from device to host. const Shape& src_shape = *operand_orig_shapes_[i]; const Shape& dst_shape = *operand_sliced_shapes_[i]; - int64_t* offset_dst = &operand_offsets_base[i]; - TF_RETURN_IF_ERROR(stream.Memcpy(offset_dst, offset_src, - dst_shape.rank() * sizeof(int64_t))); - - if (absl::Status blocked = stream.BlockHostUntilDone(); !blocked.ok()) { - return absl::InternalError(absl::StrFormat( - "Failed to retrieve all slice offset values on stream %p: %s", - &stream, blocked.message())); + TF_RET_CHECK(IsContiguousSlice(src_shape, dst_shape)); + + std::vector slice_starts; + slice_starts.reserve(dst_shape.rank()); + + // Get offset for ith operand, which has `dst_shape.rank()` components. + for (auto [idx, offset_slice] : + llvm::enumerate(*operand_offset_buffer_indices_[i])) { + se::DeviceMemoryBase offset_src = + orig_allocations.GetDeviceAddress(offset_slice); + int64_t* offset_dst = &operand_offsets_base[i + idx]; + // Copy the idx-th component of the ith offset from device to host. 
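+        // Each offset component comes from its own single-element buffer, so
+        // only sizeof(int64_t) bytes are copied, and the stream is
+        // synchronized before the host-side value is read below.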
+ TF_RETURN_IF_ERROR( + stream.Memcpy(offset_dst, offset_src, sizeof(int64_t))); + + if (absl::Status blocked = stream.BlockHostUntilDone(); !blocked.ok()) { + return absl::InternalError(absl::StrFormat( + "Failed to retrieve all slice offset values on stream %p: %s", + &stream, blocked.message())); + } + slice_starts.push_back(*offset_dst); } // Compute new slice. No need to copy the content to new buffers as we can // reuse the original buffers since slices are contiguous. - TF_RET_CHECK(IsContiguousSlice(src_shape, dst_shape)); - int64_t new_size = ShapeUtil::ByteSizeOf(dst_shape); BufferAllocation::Slice orig_slice = *embedded_thunk_operands_[i]; int64_t new_offset = orig_slice.offset(); - std::vector slice_starts(offset_dst, - offset_dst + dst_shape.rank()); for (auto [start, stride] : llvm::zip(slice_starts, *ShapeUtil::ByteStrides(src_shape))) { new_offset += start * stride; @@ -221,32 +234,37 @@ absl::Status AddressComputationThunk::ExecuteOnStream( continue; } - se::DeviceMemoryBase offset_src = - orig_allocations.GetDeviceAddress(*result_offset_buffer_indices_[i]); - - // Copy the ith offset from device to host. const Shape& src_shape = *result_orig_shapes_[i]; const Shape& dst_shape = *result_sliced_shapes_[i]; - int64_t* offset_dst = &result_offsets_base[i]; - TF_RETURN_IF_ERROR(stream.Memcpy(offset_dst, offset_src, - dst_shape.rank() * sizeof(int64_t))); - - if (absl::Status blocked = stream.BlockHostUntilDone(); !blocked.ok()) { - return absl::InternalError(absl::StrFormat( - "Failed to retrieve all slice offset values on stream %p: %s", - &stream, blocked.message())); + TF_RET_CHECK(IsContiguousSlice(src_shape, dst_shape)); + + std::vector slice_starts; + slice_starts.reserve(dst_shape.rank()); + + // Get offset for ith result, which has `dst_shape.rank()` components. + for (auto [idx, offset_slice] : + llvm::enumerate(*result_offset_buffer_indices_[i])) { + se::DeviceMemoryBase offset_src = + orig_allocations.GetDeviceAddress(offset_slice); + int64_t* offset_dst = &result_offsets_base[i + idx]; + // Copy the idx-th component of the ith offset from device to host. + TF_RETURN_IF_ERROR( + stream.Memcpy(offset_dst, offset_src, sizeof(int64_t))); + + if (absl::Status blocked = stream.BlockHostUntilDone(); !blocked.ok()) { + return absl::InternalError(absl::StrFormat( + "Failed to retrieve all slice offset values on stream %p: %s", + &stream, blocked.message())); + } + slice_starts.push_back(*offset_dst); } // Compute new slice. No need to copy the content to new buffers as we can // reuse the original buffers since slices are contiguous. 
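    // For example, slicing an f32[2,4] operand (byte strides {16, 4}) with
    // slice_starts {0, 1} adds 0 * 16 + 1 * 4 = 4 bytes to the original
    // allocation's offset.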
- TF_RET_CHECK(IsContiguousSlice(src_shape, dst_shape)); - int64_t new_size = ShapeUtil::ByteSizeOf(dst_shape); BufferAllocation::Slice orig_slice = *embedded_thunk_results_[i]; int64_t new_offset = orig_slice.offset(); - std::vector slice_starts(offset_dst, - offset_dst + dst_shape.rank()); for (auto [start, stride] : llvm::zip(slice_starts, *ShapeUtil::ByteStrides(src_shape))) { new_offset += start * stride; diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h index d4bdbfe287d9b1..b52b5fdfde861e 100644 --- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h +++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h @@ -46,11 +46,11 @@ class AddressComputationThunk : public Thunk { ThunkInfo thunk_info, std::unique_ptr embedded_thunk, std::vector> operands, std::vector> results, - std::vector> + std::vector>> operand_offset_buffer_indices, std::vector> operand_orig_shapes, std::vector> operand_sliced_shapes, - std::vector> + std::vector>> result_offset_buffer_indices, std::vector> result_orig_shapes, std::vector> result_sliced_shapes); @@ -69,11 +69,11 @@ class AddressComputationThunk : public Thunk { embedded_thunk_operands_; std::vector> embedded_thunk_results_; - std::vector> + std::vector>> operand_offset_buffer_indices_; std::vector> operand_orig_shapes_; std::vector> operand_sliced_shapes_; - std::vector> + std::vector>> result_offset_buffer_indices_; std::vector> result_orig_shapes_; std::vector> result_sliced_shapes_; diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc index e783cdea0ba6a3..1167cf18a93c57 100644 --- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc +++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc @@ -77,7 +77,7 @@ TEST(AddressComputationThunkTest, SlicedGemm) { int64_t lhs_length = sizeof(float) * 2 * 4; int64_t rhs_length = sizeof(float) * 3 * 1; int64_t out_length = sizeof(float) * 1 * 1; - int64_t lhs_offset_length = sizeof(int64_t) * 2; + int64_t offset_length = sizeof(int64_t); // Step 1: // Prepare embedded and address computation thunks. @@ -95,10 +95,15 @@ TEST(AddressComputationThunkTest, SlicedGemm) { BufferAllocation alloc_workspace(/*index=*/3, 1024 * 1024, /*color=*/0); BufferAllocation::Slice slice_workspace(&alloc_workspace, 0, 1024 * 1024); - BufferAllocation alloc_lhs_offset(/*index=*/4, lhs_offset_length, - /*color=*/0); - BufferAllocation::Slice slice_lhs_offset(&alloc_lhs_offset, 0, - lhs_offset_length); + BufferAllocation alloc_lhs_offset_0(/*index=*/4, offset_length, + /*color=*/0); + BufferAllocation::Slice slice_lhs_offset_0(&alloc_lhs_offset_0, 0, + offset_length); + + BufferAllocation alloc_lhs_offset_1(/*index=*/5, offset_length, + /*color=*/0); + BufferAllocation::Slice slice_lhs_offset_1(&alloc_lhs_offset_1, 0, + offset_length); BufferAllocation alloc_lhs_fake(/*index=*/0, rhs_length, /*color=*/0); BufferAllocation::Slice slice_lhs_fake(&alloc_lhs_fake, 0, rhs_length); @@ -119,10 +124,12 @@ TEST(AddressComputationThunkTest, SlicedGemm) { slice_out, slice_workspace, /*deterministic=*/true)); // Wrapping address computation thunk around the GEMM thunk. 
+ std::vector lhs_offsets{slice_lhs_offset_0, + slice_lhs_offset_1}; AddressComputationThunk thunk( Thunk::ThunkInfo(nullptr), std::make_unique(std::move(seq)), {slice_lhs, slice_rhs}, - {slice_out, slice_workspace}, {slice_lhs_offset, std::nullopt}, + {slice_out, slice_workspace}, {lhs_offsets, std::nullopt}, {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), std::nullopt}, {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt}, {std::nullopt, std::nullopt}, {std::nullopt, std::nullopt}, @@ -157,15 +164,17 @@ TEST(AddressComputationThunkTest, SlicedGemm) { executor->AllocateArray(1024 * 1024); TF_ASSERT_OK(stream.MemZero(&workspace, 1024 * 1024)); - se::DeviceMemory lhs_offset = executor->AllocateArray(2); + se::DeviceMemory lhs_offset_0 = executor->AllocateArray(1); + se::DeviceMemory lhs_offset_1 = executor->AllocateArray(1); std::vector lhs_offset_arr{0, 1}; - TF_ASSERT_OK( - stream.Memcpy(&lhs_offset, lhs_offset_arr.data(), lhs_offset_length)); + TF_ASSERT_OK(stream.Memcpy(&lhs_offset_0, &lhs_offset_arr[0], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&lhs_offset_1, &lhs_offset_arr[1], offset_length)); // Preparing parameters for thunk execution. ServiceExecutableRunOptions run_options; - BufferAllocations allocations({lhs, rhs, out, workspace, lhs_offset}, 0, - executor->GetAllocator()); + BufferAllocations allocations( + {lhs, rhs, out, workspace, lhs_offset_0, lhs_offset_1}, 0, + executor->GetAllocator()); Thunk::ExecuteParams params = Thunk::ExecuteParams::Create( run_options, allocations, &stream, &stream, {}, nullptr, nullptr); @@ -194,7 +203,7 @@ TEST(AddressComputationThunkTest, SlicedNonContiguousGemm) { int64_t lhs_length = sizeof(float) * 2 * 4; int64_t rhs_length = sizeof(float) * 4 * 3; int64_t out_length = sizeof(float) * 2 * 2; - int64_t offset_length = sizeof(int64_t) * 2; + int64_t offset_length = sizeof(int64_t); int64_t slice_length = sizeof(float) * 2 * 2; // Step 1: @@ -213,11 +222,25 @@ TEST(AddressComputationThunkTest, SlicedNonContiguousGemm) { BufferAllocation alloc_workspace(/*index=*/3, 1024 * 1024, /*color=*/0); BufferAllocation::Slice slice_workspace(&alloc_workspace, 0, 1024 * 1024); - BufferAllocation alloc_lhs_offset(/*index=*/4, offset_length, /*color=*/0); - BufferAllocation::Slice slice_lhs_offset(&alloc_lhs_offset, 0, offset_length); + BufferAllocation alloc_lhs_offset_0(/*index=*/4, offset_length, + /*color=*/0); + BufferAllocation::Slice slice_lhs_offset_0(&alloc_lhs_offset_0, 0, + offset_length); + + BufferAllocation alloc_lhs_offset_1(/*index=*/5, offset_length, + /*color=*/0); + BufferAllocation::Slice slice_lhs_offset_1(&alloc_lhs_offset_1, 0, + offset_length); - BufferAllocation alloc_rhs_offset(/*index=*/5, offset_length, /*color=*/0); - BufferAllocation::Slice slice_rhs_offset(&alloc_rhs_offset, 0, offset_length); + BufferAllocation alloc_rhs_offset_0(/*index=*/6, offset_length, + /*color=*/0); + BufferAllocation::Slice slice_rhs_offset_0(&alloc_rhs_offset_0, 0, + offset_length); + + BufferAllocation alloc_rhs_offset_1(/*index=*/7, offset_length, + /*color=*/0); + BufferAllocation::Slice slice_rhs_offset_1(&alloc_rhs_offset_1, 0, + offset_length); BufferAllocation alloc_lhs_fake(/*index=*/0, slice_length, /*color=*/0); BufferAllocation::Slice slice_lhs_fake(&alloc_lhs_fake, 0, slice_length); @@ -241,10 +264,14 @@ TEST(AddressComputationThunkTest, SlicedNonContiguousGemm) { slice_out, slice_workspace, /*deterministic=*/true)); // Wrapping address computation thunk around the GEMM thunk. 
+ std::vector lhs_offsets{slice_lhs_offset_0, + slice_lhs_offset_1}; + std::vector rhs_offsets{slice_rhs_offset_0, + slice_rhs_offset_1}; AddressComputationThunk thunk( Thunk::ThunkInfo(nullptr), std::make_unique(std::move(seq)), {slice_lhs, slice_rhs}, - {slice_out, slice_workspace}, {slice_lhs_offset, slice_rhs_offset}, + {slice_out, slice_workspace}, {lhs_offsets, rhs_offsets}, {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), ShapeUtil::MakeShape(PrimitiveType::F32, {4, 3})}, {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2}), @@ -286,21 +313,23 @@ TEST(AddressComputationThunkTest, SlicedNonContiguousGemm) { executor->AllocateArray(1024 * 1024); TF_ASSERT_OK(stream.MemZero(&workspace, 1024 * 1024)); - se::DeviceMemory lhs_offset = executor->AllocateArray(2); + se::DeviceMemory lhs_offset_0 = executor->AllocateArray(1); + se::DeviceMemory lhs_offset_1 = executor->AllocateArray(1); std::vector lhs_offset_arr{0, 1}; - TF_ASSERT_OK( - stream.Memcpy(&lhs_offset, lhs_offset_arr.data(), offset_length)); + TF_ASSERT_OK(stream.Memcpy(&lhs_offset_0, &lhs_offset_arr[0], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&lhs_offset_1, &lhs_offset_arr[1], offset_length)); - se::DeviceMemory rhs_offset = executor->AllocateArray(2); + se::DeviceMemory rhs_offset_0 = executor->AllocateArray(1); + se::DeviceMemory rhs_offset_1 = executor->AllocateArray(1); std::vector rhs_offset_arr{2, 1}; - TF_ASSERT_OK( - stream.Memcpy(&rhs_offset, rhs_offset_arr.data(), offset_length)); + TF_ASSERT_OK(stream.Memcpy(&rhs_offset_0, &rhs_offset_arr[0], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&rhs_offset_1, &rhs_offset_arr[1], offset_length)); // Preparing parameters for thunk execution. ServiceExecutableRunOptions run_options; - BufferAllocations allocations( - {lhs, rhs, out, workspace, lhs_offset, rhs_offset}, 0, - executor->GetAllocator()); + BufferAllocations allocations({lhs, rhs, out, workspace, lhs_offset_0, + lhs_offset_1, rhs_offset_0, rhs_offset_1}, + 0, executor->GetAllocator()); Thunk::ExecuteParams params = Thunk::ExecuteParams::Create( run_options, allocations, &stream, &stream, {}, nullptr, nullptr); @@ -322,7 +351,7 @@ TEST(AddressComputationThunkTest, MulipleSlicedOperandsGemm) { int64_t length = sizeof(float) * 2 * 4; int64_t out_length = sizeof(float) * 1; - int64_t offset_length = sizeof(int64_t) * 2; + int64_t offset_length = sizeof(int64_t); int64_t slice_length = sizeof(float) * 3; // Step 1: @@ -341,17 +370,31 @@ TEST(AddressComputationThunkTest, MulipleSlicedOperandsGemm) { BufferAllocation alloc_workspace(/*index=*/3, 1024 * 1024, /*color=*/0); BufferAllocation::Slice slice_workspace(&alloc_workspace, 0, 1024 * 1024); - BufferAllocation alloc_lhs_offset(/*index=*/4, offset_length, /*color=*/0); - BufferAllocation::Slice slice_lhs_offset(&alloc_lhs_offset, 0, offset_length); + BufferAllocation alloc_lhs_offset_0(/*index=*/4, offset_length, + /*color=*/0); + BufferAllocation::Slice slice_lhs_offset_0(&alloc_lhs_offset_0, 0, + offset_length); + + BufferAllocation alloc_lhs_offset_1(/*index=*/5, offset_length, + /*color=*/0); + BufferAllocation::Slice slice_lhs_offset_1(&alloc_lhs_offset_1, 0, + offset_length); - BufferAllocation alloc_rhs_offset(/*index=*/5, offset_length, /*color=*/0); - BufferAllocation::Slice slice_rhs_offset(&alloc_rhs_offset, 0, offset_length); + BufferAllocation alloc_rhs_offset_0(/*index=*/6, offset_length, + /*color=*/0); + BufferAllocation::Slice slice_rhs_offset_0(&alloc_rhs_offset_0, 0, + offset_length); + + BufferAllocation alloc_rhs_offset_1(/*index=*/7, 
offset_length, + /*color=*/0); + BufferAllocation::Slice slice_rhs_offset_1(&alloc_rhs_offset_1, 0, + offset_length); BufferAllocation alloc_lhs_fake(/*index=*/0, slice_length, /*color=*/0); - BufferAllocation::Slice slice_lhs_fake(&alloc_lhs_fake, 0, slice_length); + BufferAllocation::Slice slice_lhs_fake(&alloc_lhs, 0, slice_length); BufferAllocation alloc_rhs_fake(/*index=*/1, slice_length, /*color=*/0); - BufferAllocation::Slice slice_rhs_fake(&alloc_rhs_fake, 0, slice_length); + BufferAllocation::Slice slice_rhs_fake(&alloc_rhs, 0, slice_length); // Preparing config for GEMM thunk. auto config = @@ -369,10 +412,14 @@ TEST(AddressComputationThunkTest, MulipleSlicedOperandsGemm) { slice_out, slice_workspace, /*deterministic=*/true)); // Wrapping address computation thunk around the GEMM thunk. + std::vector lhs_offsets{slice_lhs_offset_0, + slice_lhs_offset_1}; + std::vector rhs_offsets{slice_rhs_offset_0, + slice_rhs_offset_1}; AddressComputationThunk thunk( Thunk::ThunkInfo(nullptr), std::make_unique(std::move(seq)), {slice_lhs, slice_rhs}, - {slice_out, slice_workspace}, {slice_lhs_offset, slice_rhs_offset}, + {slice_out, slice_workspace}, {lhs_offsets, rhs_offsets}, {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), ShapeUtil::MakeShape(PrimitiveType::F32, {8, 1})}, {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), @@ -418,21 +465,23 @@ TEST(AddressComputationThunkTest, MulipleSlicedOperandsGemm) { executor->AllocateArray(1024 * 1024); TF_ASSERT_OK(stream.MemZero(&workspace, 1024 * 1024)); - se::DeviceMemory lhs_offset = executor->AllocateArray(2); + se::DeviceMemory lhs_offset_0 = executor->AllocateArray(1); + se::DeviceMemory lhs_offset_1 = executor->AllocateArray(1); std::vector lhs_offset_arr{0, 1}; - TF_ASSERT_OK( - stream.Memcpy(&lhs_offset, lhs_offset_arr.data(), offset_length)); + TF_ASSERT_OK(stream.Memcpy(&lhs_offset_0, &lhs_offset_arr[0], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&lhs_offset_1, &lhs_offset_arr[1], offset_length)); - se::DeviceMemory rhs_offset = executor->AllocateArray(2); + se::DeviceMemory rhs_offset_0 = executor->AllocateArray(1); + se::DeviceMemory rhs_offset_1 = executor->AllocateArray(1); std::vector rhs_offset_arr{2, 0}; - TF_ASSERT_OK( - stream.Memcpy(&rhs_offset, rhs_offset_arr.data(), offset_length)); + TF_ASSERT_OK(stream.Memcpy(&rhs_offset_0, &rhs_offset_arr[0], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&rhs_offset_1, &rhs_offset_arr[1], offset_length)); // Preparing parameters for thunk execution. 
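  // With one allocation per offset value, offset_length is sizeof(int64_t)
  // (a single offset), each offset was copied into its own device array above,
  // and the BufferAllocations built below must list every offset buffer
  // separately, in the same order as the allocation indices declared earlier.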
ServiceExecutableRunOptions run_options; - BufferAllocations allocations( - {lhs, rhs, out, workspace, lhs_offset, rhs_offset}, 0, - executor->GetAllocator()); + BufferAllocations allocations({lhs, rhs, out, workspace, lhs_offset_0, + lhs_offset_1, rhs_offset_0, rhs_offset_1}, + 0, executor->GetAllocator()); Thunk::ExecuteParams params = Thunk::ExecuteParams::Create( run_options, allocations, &stream, &stream, {}, nullptr, nullptr); @@ -480,7 +529,7 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) { int64_t dst_count = 8 * 8; int64_t src_length = sizeof(int32_t) * src_count; int64_t dst_length = sizeof(int32_t) * dst_count; - int64_t offset_length = sizeof(int64_t) * 4; + int64_t offset_length = sizeof(int64_t); int64_t slice_length = sizeof(int32_t) * dst_count; // Step 1: @@ -493,8 +542,17 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) { BufferAllocation alloc_dst(/*index=*/1, dst_length, /*color=*/0); BufferAllocation::Slice slice_dst(&alloc_dst, 0, dst_length); - BufferAllocation alloc_offset(/*index=*/2, offset_length, /*color=*/0); - BufferAllocation::Slice slice_offset(&alloc_offset, 0, offset_length); + BufferAllocation alloc_offset_0(/*index=*/2, offset_length, /*color=*/0); + BufferAllocation::Slice slice_offset_0(&alloc_offset_0, 0, offset_length); + + BufferAllocation alloc_offset_1(/*index=*/3, offset_length, /*color=*/0); + BufferAllocation::Slice slice_offset_1(&alloc_offset_1, 0, offset_length); + + BufferAllocation alloc_offset_2(/*index=*/4, offset_length, /*color=*/0); + BufferAllocation::Slice slice_offset_2(&alloc_offset_2, 0, offset_length); + + BufferAllocation alloc_offset_3(/*index=*/5, offset_length, /*color=*/0); + BufferAllocation::Slice slice_offset_3(&alloc_offset_3, 0, offset_length); // Fake slices for embedded thunk creation. BufferAllocation alloc_src_fake(/*index=*/0, slice_length, /*color=*/0); @@ -520,10 +578,13 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) { /*called_computation=*/nullptr)); // Wrapping address computation thunk around the custom call thunk. + std::vector slice_offsets{ + slice_offset_0, slice_offset_1, slice_offset_2, slice_offset_3}; AddressComputationThunk thunk( Thunk::ThunkInfo(nullptr), std::make_unique(std::move(seq)), {slice_src}, {slice_dst}, - {slice_offset}, {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 8})}, + {slice_offsets}, + {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 8})}, // Make sure to pass a dst shape with the same rank as src shape (i.e. // original slice result and not bitcasted one) {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 8, 8})}, {std::nullopt}, @@ -546,14 +607,21 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) { se::DeviceMemory dst = executor->AllocateArray(dst_count); TF_ASSERT_OK(stream.MemZero(&dst, dst_length)); - se::DeviceMemory offset = executor->AllocateArray(4); + se::DeviceMemory offset_0 = executor->AllocateArray(1); + se::DeviceMemory offset_1 = executor->AllocateArray(1); + se::DeviceMemory offset_2 = executor->AllocateArray(1); + se::DeviceMemory offset_3 = executor->AllocateArray(1); std::vector offset_arr{3, 5, 2, 0}; - TF_ASSERT_OK(stream.Memcpy(&offset, offset_arr.data(), offset_length)); + TF_ASSERT_OK(stream.Memcpy(&offset_0, &offset_arr[0], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&offset_1, &offset_arr[1], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&offset_2, &offset_arr[2], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&offset_3, &offset_arr[3], offset_length)); // Preparing parameters for thunk execution. 
ServiceExecutableRunOptions run_options; - BufferAllocations allocations({src, dst, offset}, 0, - executor->GetAllocator()); + BufferAllocations allocations( + {src, dst, offset_0, offset_1, offset_2, offset_3}, 0, + executor->GetAllocator()); Thunk::ExecuteParams params = Thunk::ExecuteParams::Create( run_options, allocations, &stream, &stream, {}, nullptr, nullptr); @@ -591,7 +659,7 @@ TEST(AddressComputationThunkTest, SlicedOutputMemcpy) { int64_t slice_count = 2 * 2; int64_t src_length = sizeof(int32_t) * src_count; int64_t dst_length = sizeof(int32_t) * dst_count; - int64_t offset_length = sizeof(int64_t) * 4; + int64_t offset_length = sizeof(int64_t); int64_t slice_length = sizeof(int32_t) * slice_count; // Step 1: @@ -604,11 +672,37 @@ TEST(AddressComputationThunkTest, SlicedOutputMemcpy) { BufferAllocation alloc_dst(/*index=*/1, dst_length, /*color=*/0); BufferAllocation::Slice slice_dst(&alloc_dst, 0, dst_length); - BufferAllocation alloc_src_offset(/*index=*/2, offset_length, /*color=*/0); - BufferAllocation::Slice slice_src_offset(&alloc_src_offset, 0, offset_length); + BufferAllocation alloc_src_offset_0(/*index=*/2, offset_length, /*color=*/0); + BufferAllocation::Slice slice_src_offset_0(&alloc_src_offset_0, 0, + offset_length); + + BufferAllocation alloc_src_offset_1(/*index=*/3, offset_length, /*color=*/0); + BufferAllocation::Slice slice_src_offset_1(&alloc_src_offset_1, 0, + offset_length); + + BufferAllocation alloc_src_offset_2(/*index=*/4, offset_length, /*color=*/0); + BufferAllocation::Slice slice_src_offset_2(&alloc_src_offset_2, 0, + offset_length); - BufferAllocation alloc_dst_offset(/*index=*/3, offset_length, /*color=*/0); - BufferAllocation::Slice slice_dst_offset(&alloc_dst_offset, 0, offset_length); + BufferAllocation alloc_src_offset_3(/*index=*/5, offset_length, /*color=*/0); + BufferAllocation::Slice slice_src_offset_3(&alloc_src_offset_3, 0, + offset_length); + + BufferAllocation alloc_dst_offset_0(/*index=*/6, offset_length, /*color=*/0); + BufferAllocation::Slice slice_dst_offset_0(&alloc_dst_offset_0, 0, + offset_length); + + BufferAllocation alloc_dst_offset_1(/*index=*/7, offset_length, /*color=*/0); + BufferAllocation::Slice slice_dst_offset_1(&alloc_dst_offset_1, 0, + offset_length); + + BufferAllocation alloc_dst_offset_2(/*index=*/8, offset_length, /*color=*/0); + BufferAllocation::Slice slice_dst_offset_2(&alloc_dst_offset_2, 0, + offset_length); + + BufferAllocation alloc_dst_offset_3(/*index=*/9, offset_length, /*color=*/0); + BufferAllocation::Slice slice_dst_offset_3(&alloc_dst_offset_3, 0, + offset_length); // Fake slices for embedded thunk creation. BufferAllocation alloc_src_fake(/*index=*/0, slice_length, /*color=*/0); @@ -637,15 +731,21 @@ TEST(AddressComputationThunkTest, SlicedOutputMemcpy) { /*called_computation=*/nullptr)); // Wrapping address computation thunk around the custom call thunk. + std::vector slice_src_offsets{ + slice_src_offset_0, slice_src_offset_1, slice_src_offset_2, + slice_src_offset_3}; + std::vector slice_dst_offsets{ + slice_dst_offset_0, slice_dst_offset_1, slice_dst_offset_2, + slice_dst_offset_3}; AddressComputationThunk thunk( Thunk::ThunkInfo(nullptr), std::make_unique(std::move(seq)), {slice_src}, {slice_dst}, - {slice_src_offset}, + {slice_src_offsets}, {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 2})}, // Make sure to pass a dst shape with the same rank as src shape (i.e. 
// original slice result and not bitcasted one) {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2})}, - {slice_dst_offset}, + {slice_dst_offsets}, {{ShapeUtil::MakeShape(PrimitiveType::S32, {2, 2, 2, 2})}}, {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2})}); @@ -671,20 +771,32 @@ TEST(AddressComputationThunkTest, SlicedOutputMemcpy) { se::DeviceMemory dst = executor->AllocateArray(dst_count); TF_ASSERT_OK(stream.MemZero(&dst, dst_length)); - se::DeviceMemory src_offset = executor->AllocateArray(4); + se::DeviceMemory src_offset_0 = executor->AllocateArray(1); + se::DeviceMemory src_offset_1 = executor->AllocateArray(1); + se::DeviceMemory src_offset_2 = executor->AllocateArray(1); + se::DeviceMemory src_offset_3 = executor->AllocateArray(1); std::vector src_offset_arr{3, 5, 2, 0}; - TF_ASSERT_OK( - stream.Memcpy(&src_offset, src_offset_arr.data(), offset_length)); - - se::DeviceMemory dst_offset = executor->AllocateArray(4); + TF_ASSERT_OK(stream.Memcpy(&src_offset_0, &src_offset_arr[0], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&src_offset_1, &src_offset_arr[1], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&src_offset_2, &src_offset_arr[2], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&src_offset_3, &src_offset_arr[3], offset_length)); + + se::DeviceMemory dst_offset_0 = executor->AllocateArray(1); + se::DeviceMemory dst_offset_1 = executor->AllocateArray(1); + se::DeviceMemory dst_offset_2 = executor->AllocateArray(1); + se::DeviceMemory dst_offset_3 = executor->AllocateArray(1); std::vector dst_offset_arr{1, 1, 0, 0}; - TF_ASSERT_OK( - stream.Memcpy(&dst_offset, dst_offset_arr.data(), offset_length)); + TF_ASSERT_OK(stream.Memcpy(&dst_offset_0, &dst_offset_arr[0], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&dst_offset_1, &dst_offset_arr[1], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&dst_offset_2, &dst_offset_arr[2], offset_length)); + TF_ASSERT_OK(stream.Memcpy(&dst_offset_3, &dst_offset_arr[3], offset_length)); // Preparing parameters for thunk execution. 
ServiceExecutableRunOptions run_options; - BufferAllocations allocations({src, dst, src_offset, dst_offset}, 0, - executor->GetAllocator()); + BufferAllocations allocations( + {src, dst, src_offset_0, src_offset_1, src_offset_2, src_offset_3, + dst_offset_0, dst_offset_1, dst_offset_2, dst_offset_3}, + 0, executor->GetAllocator()); Thunk::ExecuteParams params = Thunk::ExecuteParams::Create( run_options, allocations, &stream, &stream, {}, nullptr, nullptr); From befa96da07437b7735c385a6fae7c7ccb8ef1c21 Mon Sep 17 00:00:00 2001 From: Eunjae Kim Date: Mon, 18 Mar 2024 16:41:09 -0700 Subject: [PATCH 052/670] Fix the shared_batch_scheduler_test to avoid using the designated initializer to fix the windows build failure PiperOrigin-RevId: 616983664 --- tensorflow/core/kernels/batching_util/BUILD | 1 - .../shared_batch_scheduler_test.cc | 66 +++++++++---------- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD index 828b1c0f60d4fb..d34bd7331a35d5 100644 --- a/tensorflow/core/kernels/batching_util/BUILD +++ b/tensorflow/core/kernels/batching_util/BUILD @@ -190,7 +190,6 @@ tf_cc_test( name = "shared_batch_scheduler_test", size = "small", srcs = ["shared_batch_scheduler_test.cc"], - tags = ["no_windows"], deps = [ ":batch_scheduler", ":fake_clock_env", diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc index 29b79b3bb4b712..680bbb5dd56206 100644 --- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc +++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc @@ -434,39 +434,39 @@ TEST_P( // Create two queues. - const SharedBatchScheduler::QueueOptions - queue_options = { - .input_batch_size_limit = 10, - .batch_timeout_micros = 1000 * 1000, - .max_enqueued_batches = 2, - .enable_large_batch_splitting = enable_input_batch_split(), - .split_input_task_func = - [](std::unique_ptr* input_task, - int open_batch_remaining_slot, int max_batch_size, - std::vector>* - output_tasks) -> Status { - std::unique_ptr owned_input_task = - std::move(*input_task); - const int input_task_size = owned_input_task->size(); - - const internal::InputSplitMetadata input_split_metadata( - input_task_size, open_batch_remaining_slot, max_batch_size); - - const absl::FixedArray task_sizes = - input_split_metadata.task_sizes(); - const int num_batches = task_sizes.size(); - - output_tasks->resize(num_batches); - for (int i = 0; i < num_batches; i++) { - (*output_tasks)[i] = - std::make_unique(task_sizes[i]); - } - - return absl::OkStatus(); - }, - .enable_lazy_split = enable_lazy_split(), - .max_execution_batch_size = 10, - .enable_priority_queue = true}; + SharedBatchScheduler::QueueOptions + queue_options; + queue_options.input_batch_size_limit = 10; + queue_options.batch_timeout_micros = 1000 * 1000; + queue_options.max_enqueued_batches = 2; + queue_options.enable_large_batch_splitting = enable_input_batch_split(); + queue_options.split_input_task_func = + [](std::unique_ptr* input_task, + int open_batch_remaining_slot, int max_batch_size, + std::vector>* + output_tasks) -> Status { + std::unique_ptr owned_input_task = + std::move(*input_task); + const int input_task_size = owned_input_task->size(); + + const internal::InputSplitMetadata input_split_metadata( + input_task_size, open_batch_remaining_slot, max_batch_size); + + const absl::FixedArray task_sizes = + 
input_split_metadata.task_sizes(); + const int num_batches = task_sizes.size(); + + output_tasks->resize(num_batches); + for (int i = 0; i < num_batches; i++) { + (*output_tasks)[i] = + std::make_unique(task_sizes[i]); + } + + return absl::OkStatus(); + }; + queue_options.enable_lazy_split = enable_lazy_split(); + queue_options.max_execution_batch_size = 10; + queue_options.enable_priority_queue = true; std::unique_ptr> queue_0; TF_CHECK_OK(shared_batch_scheduler->AddQueue(queue_options, From dfe2e26f8673f529f43e0f8e20ad8a7afad15aeb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 Mar 2024 16:54:49 -0700 Subject: [PATCH 053/670] Support host offloaded values as entry computation output PiperOrigin-RevId: 616987214 --- third_party/xla/xla/service/BUILD | 4 +- third_party/xla/xla/service/host_offloader.cc | 58 ++++++++++++- third_party/xla/xla/service/host_offloader.h | 2 + .../xla/xla/service/host_offloader_test.cc | 83 ++++++++++++++++++- .../xla/xla/service/layout_assignment.cc | 36 +++++--- 5 files changed, 168 insertions(+), 15 deletions(-) diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD index 1ce30da506c7f3..f41fe0f3a7c93c 100644 --- a/third_party/xla/xla/service/BUILD +++ b/third_party/xla/xla/service/BUILD @@ -4571,7 +4571,6 @@ cc_library( deps = [ ":call_graph", ":computation_layout", - ":hlo_alias_analysis", ":hlo_dce", ":hlo_graph_dumper", ":hlo_pass", @@ -4581,6 +4580,7 @@ cc_library( "//xla:permutation_util", "//xla:shape_layout", "//xla:shape_util", + "//xla:status", "//xla:status_macros", "//xla:statusor", "//xla:types", @@ -4598,8 +4598,8 @@ cc_library( "@com_google_absl//absl/types:span", "@local_tsl//tsl/platform:errors", "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:protobuf", "@local_tsl//tsl/platform:status", + "@local_tsl//tsl/platform:statusor", ], ) diff --git a/third_party/xla/xla/service/host_offloader.cc b/third_party/xla/xla/service/host_offloader.cc index b484d7fec3418b..9058a9aa48c515 100644 --- a/third_party/xla/xla/service/host_offloader.cc +++ b/third_party/xla/xla/service/host_offloader.cc @@ -166,6 +166,56 @@ HloInstruction* FindDSAnnotation(HloInstruction* hlo) { } // namespace +absl::StatusOr HostOffloader::TryOutputStreaming( + HloInstruction* custom_call) { + const HloBuffer& unique_buffer = + alias_analysis_->GetUniqueBufferAt(custom_call); + bool is_used_as_output_with_host_memory_space = false; + const HloComputation* const entry_computation = + custom_call->GetModule()->entry_computation(); + for (const HloValue* value : unique_buffer.values()) { + // Check if this is memory-only. + if (!AllPositionsAreAllowed(value)) { + // Found a position which is not allowed. + return false; + } + + // Look for a value used as a output. 
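  // The loop below walks every position of the annotated value; a position
  // that is the entry-computation root must already carry host memory space in
  // the entry layout's result shape at that index, otherwise output streaming
  // is rejected with FailedPrecondition. If no such root position is found,
  // the function returns false and the caller falls back to inserting copies.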
+ for (const auto& position : value->positions()) { + const HloInstruction* instruction = position.instruction; + const ShapeIndex& index = position.index; + if (instruction->parent() == entry_computation && instruction->IsRoot()) { + const Shape& output_shape = + ShapeUtil::GetSubshape(entry_computation->parent() + ->entry_computation_layout() + .result_shape(), + index); + CHECK(output_shape.has_layout()); + + if (output_shape.layout().memory_space() != kHostMemorySpaceColor) { + return FailedPrecondition( + "Output buffer is annotated with %s but is not marked with host " + "memory space in the entry computation.", + custom_call->name()); + } + is_used_as_output_with_host_memory_space = true; + } + } + } + if (!is_used_as_output_with_host_memory_space) { + VLOG(1) << "Buffer annotated by " << custom_call->name() + << " is not used as an output with host memory space."; + return false; + } + + VLOG(3) << "Found an output buffer annotated with " << custom_call->name() + << ". Expecting that we'll need to insert copies."; + + annotations_for_copy_to_host_to_insert_.emplace(custom_call); + AddAllPositionsToBeMovedToHostMemory(unique_buffer); + return true; +} + Status HostOffloader::HandleMoveToHostCustomCall(HloInstruction* custom_call) { VLOG(2) << "Found a custom call annotating start-of-host-offload: " << custom_call->ToString(); @@ -195,7 +245,11 @@ Status HostOffloader::HandleMoveToHostCustomCall(HloInstruction* custom_call) { } else if (op_being_annotated->opcode() == HloOpcode::kCopy) { TF_RETURN_IF_ERROR(MemoryOnlyOffloadStartingWithCopy(op_being_annotated)); } else { - TF_RETURN_IF_ERROR(MemoryOnlyOffloadInsertCopies(custom_call)); + TF_ASSIGN_OR_RETURN(bool did_output_streaming, + TryOutputStreaming(custom_call)); + if (!did_output_streaming) { + TF_RETURN_IF_ERROR(MemoryOnlyOffloadInsertCopies(custom_call)); + } } return OkStatus(); } @@ -576,7 +630,7 @@ absl::StatusOr HostOffloader::Run( // Run HloAliasAnalysis on module. TF_ASSIGN_OR_RETURN(alias_analysis_, HloAliasAnalysis::Run(module)); - // Iterate over all instructions and look for XLA host offload annoations. + // Iterate over all instructions and look for XLA host offload annotations. 
for (HloComputation* computation : module->MakeNonfusionComputations(execution_threads)) { for (HloInstruction* instruction : diff --git a/third_party/xla/xla/service/host_offloader.h b/third_party/xla/xla/service/host_offloader.h index 85966a312dc790..8bd2c0fb26598a 100644 --- a/third_party/xla/xla/service/host_offloader.h +++ b/third_party/xla/xla/service/host_offloader.h @@ -21,6 +21,7 @@ #include "absl/container/flat_hash_set.h" #include "absl/status/statusor.h" #include "absl/strings/string_view.h" +#include "xla/hlo/ir/hlo_instruction.h" #include "xla/service/hlo_alias_analysis.h" #include "xla/service/hlo_pass_interface.h" @@ -67,6 +68,7 @@ class HostOffloader : public HloModulePass { void AddAllPositionsToBeMovedToHostMemory(const HloBuffer& unique_buffer); absl::StatusOr TryParameterStreaming(HloInstruction* custom_call); + absl::StatusOr TryOutputStreaming(HloInstruction* custom_call); Status HandleMoveToHostCustomCall(HloInstruction* custom_call); Status HandleMoveToDeviceCustomCall(HloInstruction* custom_call); diff --git a/third_party/xla/xla/service/host_offloader_test.cc b/third_party/xla/xla/service/host_offloader_test.cc index 4eb459c2e60222..6b367fe53a2f54 100644 --- a/third_party/xla/xla/service/host_offloader_test.cc +++ b/third_party/xla/xla/service/host_offloader_test.cc @@ -1779,7 +1779,7 @@ ENTRY main { TEST_F(HostOffloaderTest, ParameterStreaming) { const std::string& hlo_string = R"( -HloModule ParameterStreaming, entry_computation_layout={(s32[2,1]{1,0:T(2,128)S(5)}, s32[2,1]{1,0:T(2,128)})->(s32[2,1]{1,0:T(2,128)S(5)}, s32[2,1]{1,0:T(2,128)S(5)})} +HloModule ParameterStreaming, entry_computation_layout={(s32[2,1]{1,0:T(2,128)S(5)}, s32[2,1]{1,0:T(2,128)})->(s32[2,1]{1,0:T(2,128)}, s32[2,1]{1,0:T(2,128)})} ENTRY main { param_0 = s32[2,1]{1,0} parameter(0) @@ -1854,6 +1854,87 @@ ENTRY main { EXPECT_FALSE(HaveRemainingOffloadAnnotations(module.get())); } +TEST_F(HostOffloaderTest, OutputStreaming) { + const std::string& hlo_string = R"( +HloModule ParameterStreaming, entry_computation_layout={(s32[2,1]{1,0:T(2,128)}, s32[2,1]{1,0:T(2,128)})->(s32[2,1]{1,0:T(2,128)S(5)}, s32[2,1]{1,0:T(2,128)})} + +ENTRY main { + param_0 = s32[2,1]{1,0} parameter(0) + param_1 = s32[2,1]{1,0} parameter(1) + constant_2 = s32[] constant(2) + constant_4 = s32[] constant(4) + broadcast_0 = s32[2,1]{1,0} broadcast(constant_2), dimensions={} + multiply_0 = s32[2,1]{1,0} multiply(param_1, broadcast_0) + multiply_1 = s32[2,1]{1,0} multiply(multiply_0, param_0) + broadcast_1 = s32[2,1]{1,0} broadcast(constant_4), dimensions={} + multiply_2 = s32[2,1]{1,0} multiply(multiply_1, broadcast_1) + custom_call = s32[2,1]{1,0} custom-call(multiply_2), custom_call_target="MoveToHost" + ROOT tuple = (s32[2,1]{1,0}, s32[2,1]{1,0}) tuple(custom_call, multiply_1) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHostOffloader(module.get())); + + EXPECT_TRUE(changed); + + // Look for the following pattern: + // constant + // | + // param1 broadcast param0 + // \ / / + // multiply / + // \ / + // \ / + // multiply constant + // | | | + // | ---+---broadcast + // | / | + // multiply | + // | | + // copy | + // \ | + // tuple + HloInstruction* param_1; + HloInstruction* broadcast_0; + HloInstruction* multiply_0; + HloInstruction* param_0; + HloInstruction* multiply_1; + HloInstruction* broadcast_1; + HloInstruction* multiply_2; + HloInstruction* copy; + HloInstruction* tuple; + auto multiplyPattern = + 
m::Multiply(&multiply_1, + m::Multiply(&multiply_0, m::Parameter(¶m_1), + m::Broadcast(&broadcast_0, m::ConstantScalar(2))), + m::Parameter(¶m_0)); + ASSERT_THAT( + module->entry_computation()->root_instruction(), + GmockMatch(m::Tuple( + &tuple, + m::Copy(©, m::Multiply( + &multiply_2, multiplyPattern, + m::Broadcast(&broadcast_1, m::ConstantScalar(4)))), + multiplyPattern))); + TestShapeHasMemorySpace(param_1->shape(), Layout::kDefaultMemorySpace); + TestShapeHasMemorySpace(broadcast_0->shape(), Layout::kDefaultMemorySpace); + TestShapeHasMemorySpace(multiply_0->shape(), Layout::kDefaultMemorySpace); + TestShapeHasMemorySpace(param_0->shape(), Layout::kDefaultMemorySpace); + TestShapeHasMemorySpace(multiply_1->shape(), Layout::kDefaultMemorySpace); + TestShapeHasMemorySpace(broadcast_1->shape(), Layout::kDefaultMemorySpace); + TestShapeHasMemorySpace(multiply_2->shape(), Layout::kDefaultMemorySpace); + TestShapeHasMemorySpace(copy->shape(), kHostMemorySpaceColor); + TestShapeHasMemorySpace(ShapeUtil::GetSubshape(tuple->shape(), {0}), + Layout::kHostMemorySpace); + TestShapeHasMemorySpace(ShapeUtil::GetSubshape(tuple->shape(), {1}), + Layout::kDefaultMemorySpace); + + EXPECT_FALSE(HaveRemainingOffloadAnnotations(module.get())); +} + } // namespace } // namespace xla diff --git a/third_party/xla/xla/service/layout_assignment.cc b/third_party/xla/xla/service/layout_assignment.cc index 67874e38d67a17..c79261e6bb8d6c 100644 --- a/third_party/xla/xla/service/layout_assignment.cc +++ b/third_party/xla/xla/service/layout_assignment.cc @@ -15,24 +15,23 @@ limitations under the License. #include "xla/service/layout_assignment.h" -#include +#include #include -#include #include #include -#include #include #include #include -#include #include #include #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_set.h" #include "absl/log/log.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" +#include "absl/strings/string_view.h" #include "absl/types/span.h" #include "xla/hlo/ir/hlo_casting_utils.h" #include "xla/hlo/ir/hlo_computation.h" @@ -40,12 +39,13 @@ limitations under the License. #include "xla/hlo/ir/hlo_instruction.h" #include "xla/hlo/ir/hlo_instructions.h" #include "xla/hlo/ir/hlo_opcode.h" +#include "xla/hlo/ir/hlo_sharding.h" +#include "xla/layout.h" #include "xla/layout_util.h" #include "xla/map_util.h" #include "xla/permutation_util.h" #include "xla/service/call_graph.h" #include "xla/service/computation_layout.h" -#include "xla/service/hlo_alias_analysis.h" #include "xla/service/hlo_dce.h" #include "xla/service/logical_buffer.h" #include "xla/service/tuple_points_to_analysis.h" @@ -53,15 +53,15 @@ limitations under the License. #include "xla/shape.h" #include "xla/shape_layout.h" #include "xla/shape_util.h" +#include "xla/status.h" #include "xla/status_macros.h" #include "xla/statusor.h" -#include "xla/types.h" #include "xla/util.h" #include "xla/xla_data.pb.h" #include "tsl/platform/errors.h" #include "tsl/platform/logging.h" -#include "tsl/platform/protobuf.h" #include "tsl/platform/status.h" +#include "tsl/platform/statusor.h" namespace xla { @@ -2019,12 +2019,28 @@ Status LayoutAssignment::PropagateBufferConstraintToUses( Status LayoutAssignment::PropagateResultConstraint( const ComputationLayoutConstraint& layout_constraint, LayoutConstraints* constraints) { + ShapeLayout result_layout = + layout_constraint.computation_layout().result_layout(); + // Clear out memory space in layout for entry computation root. 
Host offloader + // will do the analysis later and add back the memory space for host outputs. + if (constraints->computation()->IsEntryComputation()) { + Shape result_shape = result_layout.shape(); + TF_RETURN_IF_ERROR(ShapeUtil::ForEachMutableSubshapeWithStatus( + &result_shape, [](Shape* subshape, const ShapeIndex& shape_index) { + if (subshape->has_layout() && subshape->IsArray()) { + subshape->mutable_layout()->set_memory_space( + Layout::kDefaultMemorySpace); + } + return OkStatus(); + })); + TF_RETURN_IF_ERROR(result_layout.CopyLayoutFromShape(result_shape)); + } + // Propagate the use constraint of the root instruction up to the logical // buffers which make up the result. return PropagateUseConstraintToDefs( - layout_constraint.computation_layout().result_layout(), - constraints->computation()->root_instruction(), constraints, - current_priority_); + result_layout, constraints->computation()->root_instruction(), + constraints, current_priority_); } // Infers the layout of the array at the given index in the given instruction's From a141be8dae4222f4fbc23b0ba9919cb8fbca2ac6 Mon Sep 17 00:00:00 2001 From: Dan Suh Date: Mon, 18 Mar 2024 17:09:51 -0700 Subject: [PATCH 054/670] Implement basic `QuantizationReport`. This is a minimally working version of `QuantizationReport` where a user may add a single `QuantizationResult` manually. In future revisions, it will be able to parse `QuantizationResult`s from `ModuleOp` and populate internal data automatically. PiperOrigin-RevId: 616991325 --- .../mlir/quantization/stablehlo/cc/BUILD | 20 ++++++ .../mlir/quantization/stablehlo/cc/report.cc | 29 +++++++++ .../mlir/quantization/stablehlo/cc/report.h | 48 ++++++++++++++ .../quantization/stablehlo/cc/report_test.cc | 64 +++++++++++++++++++ .../stablehlo/quantization_config.proto | 26 ++++++++ 5 files changed, 187 insertions(+) create mode 100644 tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc create mode 100644 tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h create mode 100644 tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD index 7a36ad58dc34a4..2ba0127d2b9c97 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD @@ -276,6 +276,26 @@ tf_cc_test( ], ) +cc_library( + name = "report", + srcs = ["report.cc"], + hdrs = ["report.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + ], +) + +tf_cc_test( + name = "report_test", + srcs = ["report_test.cc"], + deps = [ + ":report", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "context", srcs = [], diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc new file mode 100644 index 00000000000000..ef24c16dbf4acc --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc @@ -0,0 +1,29 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h" + +#include + +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace mlir::quant::stablehlo { + +using ::stablehlo::quantization::QuantizationResult; + +void QuantizationReport::AddQuantizationResult(QuantizationResult&& result) { + *quantization_results_.add_results() = std::move(result); +} + +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h new file mode 100644 index 00000000000000..94eb47463f16c1 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h @@ -0,0 +1,48 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_REPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_REPORT_H_ + +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace mlir::quant::stablehlo { + +// A class that manages information about `QuantizableUnit`s post-quantization, +// internally in the form of `QuantizationUnits`. It is used to collect +// quantization summary from a quantized `ModuleOp` and emit it in a human- and +// machine-readable format. +class QuantizationReport { + public: + QuantizationReport() = default; + + // Adds a `QuantizationResult` to the report. + void AddQuantizationResult( + ::stablehlo::quantization::QuantizationResult&& result); + + // Returns `QuantizationResults` that are registered in this report. + const ::stablehlo::quantization::QuantizationResults& GetQuantizationResults() + const { + return quantization_results_; + } + + private: + // Quantization results that are registered in this report. A quantization + // result may be added manually by calling `AddQuantizationResult`. + ::stablehlo::quantization::QuantizationResults quantization_results_; +}; + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_REPORT_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc new file mode 100644 index 00000000000000..f6897f7fde401d --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc @@ -0,0 +1,64 @@ +/* Copyright 2024 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h" + +#include + +#include +#include +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace mlir::quant::stablehlo { +namespace { + +using ::stablehlo::quantization::Method; +using ::stablehlo::quantization::QuantizableUnit; +using ::stablehlo::quantization::QuantizationResult; +using ::stablehlo::quantization::QuantizationResults; +using ::testing::IsEmpty; +using ::testing::SizeIs; +using ::testing::StrEq; + +TEST(QuantizationReportTest, GetQuantizationResultsReturnsEmptyResults) { + QuantizationReport report{}; + + const QuantizationResults& results = report.GetQuantizationResults(); + ASSERT_THAT(results.results(), IsEmpty()); +} + +TEST(QuantizationReportTest, AddQuantizationResult) { + // Construct a `QuantizationResult` to add, representing a unit named + // `quantized_my_function` that is not quantized. + QuantizationResult result{}; + QuantizableUnit& quantizable_unit = *result.mutable_quantizable_unit(); + quantizable_unit.set_name("quantized_my_function"); + + Method& method = *result.mutable_method(); + method.mutable_no_quantization(); + + QuantizationReport report{}; + report.AddQuantizationResult(std::move(result)); + + const QuantizationResults& results = report.GetQuantizationResults(); + ASSERT_THAT(results.results(), SizeIs(1)); + + const QuantizationResult& first_result = results.results(0); + EXPECT_THAT(first_result.quantizable_unit().name(), + StrEq("quantized_my_function")); + EXPECT_TRUE(first_result.method().has_no_quantization()); +} + +} // namespace +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto index 81aff6e46d5850..56645d7f3d73ad 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto +++ b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto @@ -63,6 +63,32 @@ message PipelineConfig { optional bool unpack_quantized_types = 1; } +// Represents a single quantizable unit, a (nearly) minimum unit of work when +// applying quantization. It may correspond to a single or multiple ops. +// Next ID: 2 +message QuantizableUnit { + // Name of the `FuncOp` symbol corresponding to the "lifted function", + // representing a single quantizable unit. This value is guaranteed to be + // unique across a single `ModuleOp`. + string name = 1; +} + +// Represents a quantization result of a single `QuantizableUnit`. It is +// essentially a `(QuantizableUnit, Method)` pair, where the `Method` +// corresponds to the quantization method eventually applied to the +// `QuantizableUnit`. +// Next ID: 3 +message QuantizationResult { + QuantizableUnit quantizable_unit = 1; + Method method = 2; +} + +// A series of `QuantizationResult`s. 
See `QuantizationResult` for details. +// Next ID: 2 +message QuantizationResults { + repeated QuantizationResult results = 1; +} + // A quantization method representing "do not quantize". Mostly used for // denylisting quantizable units from quantization. message NoQuantization {} From 858019381d8496e45a40184a705482bc78f34d5f Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Mon, 18 Mar 2024 17:22:55 -0700 Subject: [PATCH 055/670] [PJRT] Drop mentions of CPU support from the stream_executor client. We never use the stream_executor client on CPU any more, since the TFRT CPU client is better in every way. PiperOrigin-RevId: 616994524 --- .../xla/pjrt/pjrt_stream_executor_client.cc | 65 +------------------ 1 file changed, 2 insertions(+), 63 deletions(-) diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc index 9275f94492133d..cfe5962915dce0 100644 --- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc +++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc @@ -616,9 +616,7 @@ void PjRtStreamExecutorBuffer::ScopedHold::AddToInput( } } -bool PjRtStreamExecutorBuffer::IsOnCpu() const { - return client()->platform_id() == CpuId(); -} +bool PjRtStreamExecutorBuffer::IsOnCpu() const { return false; } StatusOr PjRtStreamExecutorBuffer::logical_on_device_shape() { if (on_device_shape_.is_static()) { @@ -827,59 +825,6 @@ PjRtStreamExecutorClient::BufferFromHostBuffer( ShapeUtil::ByteStrides(device_shape, absl::MakeSpan(shape_strides))); bool host_and_device_strides_equal = (size == 0 || *byte_strides == shape_strides); - // The CPU platform is special because the "host" and the "device" are in the - // same memory space. If the input shape is in the correct layout and we don't - // want to defer the copy onto a thread, we can use the following fast - // path. - bool is_cpu_platform = - local_device->executor()->platform()->id() == se::host::kHostPlatformId; - if (is_cpu_platform) { - // If we are on the host platform and the input buffer is sufficiently - // aligned, we can simply point to the input array's data without any - // further copies. At the time of writing we require a 16-byte alignment - // because XLA may generate code which requires it. - bool can_use_zero_copy = - host_buffer_semantics == HostBufferSemantics::kZeroCopy && - ((absl::bit_cast(data) & - (cpu_function_runtime::MinAlign() - 1)) == 0); - if (host_and_device_strides_equal && - (host_buffer_semantics == - HostBufferSemantics::kImmutableOnlyDuringCall || - can_use_zero_copy)) { - absl::AnyInvocable on_delete_callback; - se::DeviceMemoryBase buffer; - // If we are on the host platform and the input buffer is sufficiently - // aligned, we can simply point to the input array's data without any - // further copies. At the time of writing we require a 16-byte alignment - // because XLA may generate code which requires it. 
- if (can_use_zero_copy) { - on_delete_callback = std::move(on_done_with_host_buffer); - buffer = se::DeviceMemoryBase( - const_cast(static_cast(data)), size); - } else { - void* staging_buffer = host_memory_allocator()->AllocateRaw( - cpu_function_runtime::MinAlign(), size); - buffer = se::DeviceMemoryBase(staging_buffer, size); - std::memcpy(staging_buffer, data, size); - if (on_done_with_host_buffer) { - std::move(on_done_with_host_buffer)(); - } - on_delete_callback = [staging_buffer, host_memory_allocator = - host_memory_allocator()]() { - host_memory_allocator->DeallocateRaw(staging_buffer); - }; - } - absl::Span> - definition_events; - auto device_buffer = std::make_shared( - /*allocator=*/nullptr, local_device->local_device_id().value(), - std::initializer_list{buffer}, - definition_events, std::move(on_delete_callback)); - return std::unique_ptr( - std::make_unique( - device_shape, std::move(device_buffer), this, device)); - } - } TF_ASSIGN_OR_RETURN( std::unique_ptr py_buffer, @@ -1038,13 +983,7 @@ PjRtStreamExecutorClient::BufferFromHostBuffer( } })); }; - if (is_cpu_platform) { - // Using the thread_pool would be a double thread hop; the code - // already defers its work onto a stream (= thread on CPU). - transfer_h2d(); - } else { - thread_pool()->Schedule(transfer_h2d); - } + thread_pool()->Schedule(transfer_h2d); return std::unique_ptr(std::move(py_buffer)); } From e7849b639bb26a4b4f570c8868412989fc228949 Mon Sep 17 00:00:00 2001 From: Ilia Sergachev Date: Mon, 18 Mar 2024 17:35:25 -0700 Subject: [PATCH 056/670] PR #10612: [GPU] cuDNN GEMM fusions: enable noncontracting dimension transformations. Imported from GitHub PR https://github.com/openxla/xla/pull/10612 This kind of transformations is already in use by the Triton GEMM backend for a while. Copybara import of the project: -- 51eaaf6c8b722ef7c3273825d0585371cd55da26 by Ilia Sergachev : [GPU] Support broadcasts in cuDNN GEMM fusions. -- 40aed83572cd3f09fa8530b6c130bf84be226593 by Ilia Sergachev : [XLA:GPU] Enable noncontracting to batch dimension transformation in cuDNN GEMM fusions. This transformation is already in use by the Triton GEMM backend for a while. Merging this change closes #10612 PiperOrigin-RevId: 616998256 --- .../xla/service/gpu/cudnn_fusion_compiler.cc | 86 +++++++-- .../xla/xla/service/gpu/fusions/cudnn_test.cc | 180 ++++++++++++++++++ third_party/xla/xla/xla.proto | 6 +- 3 files changed, 259 insertions(+), 13 deletions(-) diff --git a/third_party/xla/xla/service/gpu/cudnn_fusion_compiler.cc b/third_party/xla/xla/service/gpu/cudnn_fusion_compiler.cc index 3b4a5c4cc5b825..1bd93e3ee2a243 100644 --- a/third_party/xla/xla/service/gpu/cudnn_fusion_compiler.cc +++ b/third_party/xla/xla/service/gpu/cudnn_fusion_compiler.cc @@ -103,6 +103,13 @@ inline std::optional ToCudnnDataType(const PrimitiveType type) { } } +int FusionLevel(const HloInstruction& hlo) { + return hlo.GetModule() + ->config() + .debug_options() + .xla_gpu_cudnn_gemm_fusion_level(); +}; + // Extracts dimensions and strides from HLO tensors in the format expected by // cuDNN. class GemmDimensionAdapter { @@ -139,17 +146,21 @@ class GemmDimensionAdapter { std::vector& strides) { const DotDimensionNumbers& dims = dot_.dot_dimension_numbers(); // GEMM fusions require a specific canonical order of dimensions. 
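  // The canonical order assembled below is {batch, noncontracting, contracting}
  // for the LHS, {batch, contracting, noncontracting} for the RHS, and
  // {batch, M, N} for the output, with -1 marking an absent batch dimension.
  // For illustration only: a plain [m,k] x [k,n] dot ends up described to
  // cuDNN as roughly {1, m, k}, {1, k, n} and {1, m, n}.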
+ constexpr int kBatchDimensionIndex = 0; + constexpr int kOutputLHSNonContractingDimensionIndex = 1; std::vector dim_indices; + int lhs_noncontracting_index = -1; switch (scope) { case TritonFusionAnalysis::Scope::LHS: - dim_indices = {dims.lhs_batch_dimensions().empty() - ? -1 - : dims.lhs_batch_dimensions(0), - GetNonContractingDims(dot_.operand(0)->shape(), - dims.lhs_batch_dimensions(), - dims.lhs_contracting_dimensions()) - .value()[0], - dims.lhs_contracting_dimensions(0)}; + lhs_noncontracting_index = + GetNonContractingDims(dot_.operand(0)->shape(), + dims.lhs_batch_dimensions(), + dims.lhs_contracting_dimensions()) + .value()[0]; + dim_indices = { + dims.lhs_batch_dimensions().empty() ? -1 + : dims.lhs_batch_dimensions(0), + lhs_noncontracting_index, dims.lhs_contracting_dimensions(0)}; break; case TritonFusionAnalysis::Scope::RHS: dim_indices = {dims.rhs_batch_dimensions().empty() @@ -162,8 +173,9 @@ class GemmDimensionAdapter { .value()[0]}; break; case TritonFusionAnalysis::Scope::OUTPUT: + lhs_noncontracting_index = dot_.shape().rank() - 2; dim_indices = {dims.lhs_batch_dimensions().empty() ? -1 : 0, - dot_.shape().rank() - 2, dot_.shape().rank() - 1}; + lhs_noncontracting_index, dot_.shape().rank() - 1}; break; case TritonFusionAnalysis::Scope::META: LOG(FATAL) << "Unsupported scope."; @@ -177,17 +189,67 @@ class GemmDimensionAdapter { strides.push_back(strides.empty() ? 1 : strides.back()); continue; } else { - if (spec->size() != 1) { + if (spec->size() == 1) { + // The dimension is not split, nothing to do. + } else if (spec->size() == 2) { + if (FusionLevel(hlo) < 3) { + return false; + } + if (!dims.lhs_batch_dimensions().empty()) { + VLOG(8) << "Noncontracting dimension split is not compatible with " + "batch dimensions."; + return false; + } + if (index != lhs_noncontracting_index) { + VLOG(8) << "Only LHS noncontracting dimension can be split."; + return false; + } + switch (scope) { + case TritonFusionAnalysis::Scope::LHS: + lhs_noncontracting_split = spec->back().count; + break; + case TritonFusionAnalysis::Scope::OUTPUT: + if (lhs_noncontracting_split != spec->back().count) { + VLOG(8) << "Output non-contracting dimension has to be split " + "the same way as the LHS input one if it is split."; + return false; + } + break; + default: + VLOG(8) << "Only LHS noncontracting dimension can be split."; + return false; + } + // Assign the major part of the noncontracting dimension to the + // unused batch one. + CHECK_EQ(dimensions[kBatchDimensionIndex], 1); + dimensions[kBatchDimensionIndex] = spec->back().count; + strides[kBatchDimensionIndex] = spec->back().stride; + } else { + VLOG(8) << "The dimension is split multiple times."; return false; } dimensions.push_back(spec->front().count); strides.push_back(spec->front().stride); } } + if (lhs_noncontracting_split > 1 && + scope == TritonFusionAnalysis::Scope::OUTPUT && + dimensions[kBatchDimensionIndex] == 1) { + // LHS input noncontracting dimension is split but the corresponding + // output one is not. Assign part of the output one to the unused batch + // dimension. 
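  // Illustration with assumed values: if the LHS noncontracting extent M is
  // seen as a split M = S * M' while the output stays a row-major [M, N]
  // buffer, the fix-up below reports the output as dimensions {S, M', N} with
  // strides {M' * N, N, 1}.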
+ dimensions[kBatchDimensionIndex] = lhs_noncontracting_split; + dimensions[kOutputLHSNonContractingDimensionIndex] /= + lhs_noncontracting_split; + strides[kBatchDimensionIndex] = + strides[kOutputLHSNonContractingDimensionIndex] * + dimensions[kOutputLHSNonContractingDimensionIndex]; + } return true; } private: + int64_t lhs_noncontracting_split = 1; const HloDotInstruction& dot_; }; @@ -254,7 +316,9 @@ absl::StatusOr> HloFusionToCuDnnGraph( } else if (hlo->opcode() == HloOpcode::kReshape || hlo->opcode() == HloOpcode::kBitcast || hlo->opcode() == HloOpcode::kTranspose || - hlo->opcode() == HloOpcode::kCopy) { + hlo->opcode() == HloOpcode::kCopy || + (FusionLevel(fusion) >= 2 && + hlo->opcode() == HloOpcode::kBroadcast)) { // All these are accounted for separately as transformations of strides. hlo_to_cudnn[hlo] = operand(0); } else if (hlo->IsElementwise()) { diff --git a/third_party/xla/xla/service/gpu/fusions/cudnn_test.cc b/third_party/xla/xla/service/gpu/fusions/cudnn_test.cc index 2d69800a63b69a..40da7bbbc039ff 100644 --- a/third_party/xla/xla/service/gpu/fusions/cudnn_test.cc +++ b/third_party/xla/xla/service/gpu/fusions/cudnn_test.cc @@ -28,6 +28,7 @@ class CuDnnFusionTest : public GpuCodegenTest { // Let this group of tests just use first available plan skipping // autotuning. debug_options.set_xla_gpu_autotune_level(0); + debug_options.set_xla_gpu_cudnn_gemm_fusion_level(1); return debug_options; } bool IsAtLeastHopperWithCuDnn9() { @@ -291,6 +292,185 @@ ENTRY %e { EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3})); } +class CuDnnFusionLevel2Test : public CuDnnFusionExecutionTest { + public: + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = + CuDnnFusionExecutionTest::GetDebugOptionsForTest(); + debug_options.set_xla_gpu_cudnn_gemm_fusion_level(2); + return debug_options; + } +}; + +TEST_F(CuDnnFusionLevel2Test, BroadcastToDim2ExecutesCorrectly) { + EXPECT_TRUE(RunAndCompare(R"( +fusion1 { + p0 = f16[16,32,128] parameter(0) + p1 = f16[16,128,64] parameter(1) + p2 = f16[16,32] parameter(2) + p2b = f16[16,32,128] broadcast(p2), dimensions={0,1} + a = f16[16,32,128] add(p0, p2b) + ROOT r = f16[16,32,64] dot(a, p1), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} +} + +ENTRY e { + p0 = f16[16,32,128] parameter(0) + p1 = f16[16,128,64] parameter(1) + p2 = f16[16,32] parameter(2) + ROOT _ = f16[16,32,64] fusion(p0, p1, p2), kind=kCustom, calls=fusion1, + backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}} +})", + ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3})); +} + +TEST_F(CuDnnFusionLevel2Test, BroadcastToDim1ExecutesCorrectly) { + EXPECT_TRUE(RunAndCompare(R"( +fusion1 { + p0 = f16[16,32,128] parameter(0) + p1 = f16[16,128,64] parameter(1) + p2 = f16[16,128] parameter(2) + p2b = f16[16,32,128] broadcast(p2), dimensions={0,2} + a = f16[16,32,128] add(p0, p2b) + ROOT r = f16[16,32,64] dot(a, p1), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} +} + +ENTRY e { + p0 = f16[16,32,128] parameter(0) + p1 = f16[16,128,64] parameter(1) + p2 = f16[16,128] parameter(2) + ROOT _ = f16[16,32,64] fusion(p0, p1, p2), kind=kCustom, calls=fusion1, + backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}} +})", + ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3})); +} + +TEST_F(CuDnnFusionLevel2Test, BroadcastToDim0ExecutesCorrectly) { + EXPECT_TRUE(RunAndCompare(R"( +fusion1 { + p0 = bf16[32,128] parameter(0) + p0b = bf16[5,32,128] 
broadcast(p0), dimensions={1,2} + p1 = bf16[5,128,64] parameter(1) + ROOT r = f32[5,32,64] dot(p0b, p1), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} +} + +ENTRY e { + p0 = bf16[32,128] parameter(0) + p1 = bf16[5,128,64] parameter(1) + ROOT _ = f32[5,32,64] fusion(p0, p1), kind=kCustom, calls=fusion1, + backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}} +})", + ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3})); +} + +TEST_F(CuDnnFusionLevel2Test, BroadcastTo2DimsExecutesCorrectly) { + EXPECT_TRUE(RunAndCompare(R"( +fusion1 { + p0 = f16[16,32,128] parameter(0) + p1 = f16[16,128,64] parameter(1) + p2 = f16[128] parameter(2) + p2b = f16[16,32,128] broadcast(p2), dimensions={2} + a = f16[16,32,128] add(p0, p2b) + ROOT r = f16[16,32,64] dot(a, p1), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} +} + +ENTRY e { + p0 = f16[16,32,128] parameter(0) + p1 = f16[16,128,64] parameter(1) + p2 = f16[128] parameter(2) + ROOT _ = f16[16,32,64] fusion(p0, p1, p2), kind=kCustom, calls=fusion1, + backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}} +})", + ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3})); +} + +TEST_F(CuDnnFusionLevel2Test, BroadcastTo3DimsExecutesCorrectly) { + EXPECT_TRUE(RunAndCompare(R"( +fusion1 { + p0 = f16[16,32,128] parameter(0) + p1 = f16[16,128,64] parameter(1) + p2 = f16[] parameter(2) + p2b = f16[16,32,128] broadcast(p2), dimensions={} + a = f16[16,32,128] add(p0, p2b) + ROOT r = f16[16,32,64] dot(a, p1), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} +} + +ENTRY e { + p0 = f16[16,32,128] parameter(0) + p1 = f16[16,128,64] parameter(1) + p2 = f16[] parameter(2) + ROOT _ = f16[16,32,64] fusion(p0, p1, p2), kind=kCustom, calls=fusion1, + backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}} +})", + ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3})); +} + +class CuDnnFusionLevel3Test : public CuDnnFusionExecutionTest { + public: + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = + CuDnnFusionExecutionTest::GetDebugOptionsForTest(); + debug_options.set_xla_gpu_cudnn_gemm_fusion_level(3); + return debug_options; + } +}; + +TEST_F(CuDnnFusionLevel3Test, + DotWithSplitNonContractingInputExecutesCorrectly) { + EXPECT_TRUE(RunAndCompare(R"( +fusion1 { + p0 = s8[4,3,16,400]{2,1,3,0} parameter(0) + cp0 = s8[4,3,16,400]{3,2,1,0} copy(p0) + bc0 = s8[192,400]{1,0} bitcast(cp0) + cvt0 = bf16[192,400]{1,0} convert(bc0) + p1 = bf16[1,128,400]{2,1,0} parameter(1) + bc1 = bf16[128,400]{1,0} reshape(p1) + ROOT d = bf16[192,128]{1,0} dot(cvt0, bc1), + lhs_contracting_dims={1}, rhs_contracting_dims={1} +} + +ENTRY r { + p0 = s8[4,3,16,400]{2,1,3,0} parameter(0) + p1 = bf16[1,128,400]{2,1,0} parameter(1) + ROOT r = bf16[192,128]{1,0} fusion(p0, p1), kind=kCustom, calls=fusion1, + backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}} +})", + ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3})); +} + +TEST_F(CuDnnFusionLevel3Test, + DotWithSplitNonContractingInOutExecutesCorrectly) { + EXPECT_TRUE(RunAndCompare(R"( +fusion1 { + p0 = s8[4,3,16,400]{2,1,3,0} parameter(0) + cp0 = s8[4,3,16,400]{3,2,1,0} copy(p0) + bc0 = s8[192,400]{1,0} bitcast(cp0) + cvt0 = bf16[192,400]{1,0} convert(bc0) + p1 = bf16[1,128,400]{2,1,0} parameter(1) + bc1 = bf16[128,400]{1,0} reshape(p1) + d = bf16[192,128]{1,0} dot(cvt0, bc1), lhs_contracting_dims={1}, rhs_contracting_dims={1} + bc = bf16[4,3,16,128]{3,2,1,0} bitcast(d) + ROOT cp 
= bf16[4,3,16,128]{2,1,3,0} copy(bc) +} + +ENTRY r { + p0 = s8[4,3,16,400]{2,1,3,0} parameter(0) + p1 = bf16[1,128,400]{2,1,0} parameter(1) + ROOT r = bf16[4,3,16,128]{2,1,3,0} fusion(p0, p1), kind=kCustom, calls=fusion1, + backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}} +})", + ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3})); +} + class CuDnnFusionRewriteTest : public CuDnnFusionTest { public: DebugOptions GetDebugOptionsForTest() override { diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto index 69a40c9dc09e41..1c84566bebb4aa 100644 --- a/third_party/xla/xla/xla.proto +++ b/third_party/xla/xla/xla.proto @@ -723,8 +723,10 @@ message DebugOptions { // Let GEMM fusion autotuning probe cuDNN as a backend. // Current levels: - // 0: disabled. - // 1: fusions of GEMM, elementwise, transpose/reshape operations. + // 0: Disabled. + // 1: Fusions of GEMM, elementwise, transpose/reshape operations. + // 2: + Broadcasts. + // 3: + Nontrivial noncontracting dimension reshapes/transposes. int32 xla_gpu_cudnn_gemm_fusion_level = 285; // Next id: 286 From 7c6c9233e5ac4a668fc411bc3153fc05736f6a3c Mon Sep 17 00:00:00 2001 From: Kevin Gleason Date: Mon, 18 Mar 2024 17:43:26 -0700 Subject: [PATCH 057/670] Only mark ops converted by pattern at illegal PiperOrigin-RevId: 617000313 --- .../stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir | 2 ++ .../lite/stablehlo/transforms/smuggle_disallowed_ops.cc | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir index ec8ab139054e63..4a0f6a5d5e673b 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir @@ -1,10 +1,12 @@ // RUN: odml_to_stablehlo %s -skip-resize -smuggle-disallowed-ops -o - | FileCheck %s +// RUN: odml-to-stablehlo-opt %s --smuggle-disallowed-ops-pass | FileCheck %s --check-prefix=CHECK-OPT // CHECK-LABEL: @main module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 975 : i32}, tf_saved_model.semantics} { func.func @serving_default(%arg0: tensor<1x32x32x128xf32> {tf_saved_model.index_path = ["a"]}) -> (tensor<1x64x64x128xf32> {tf_saved_model.index_path = ["b"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "c:0", outputs = "d:0"}, tf_saved_model.exported_names = ["serving_default"]} { %0 = "tf.Const"() {value = dense<[56, 904]> : tensor<2xi32>} : () -> tensor<2xi32> // CHECK: %1 = stablehlo.custom_call @tf.ResizeBilinear(%arg0, %0) {align_corners = false, device = "", half_pixel_centers = true} : (tensor<1x32x32x128xf32>, tensor<2xi32>) -> tensor<1x64x64x128xf32> + // CHECK-OPT: %0 = stablehlo.custom_call @tf.ResizeBilinear(%arg0, %cst) {align_corners = false, device = "", half_pixel_centers = true} : (tensor<1x32x32x128xf32>, tensor<2xi32>) -> tensor<1x64x64x128xf32> %1 = "tf.ResizeBilinear"(%arg0, %0) { align_corners = false, device = "", half_pixel_centers = true } : (tensor<1x32x32x128xf32>, tensor<2xi32>) -> tensor<1x64x64x128xf32> diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc index 033ec78751e6b6..06754ea72b580c 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc +++ 
b/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc @@ -21,6 +21,7 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project @@ -70,6 +71,9 @@ class SmuggleDisallowedOpsPass StringRef getDescription() const final { return "Smuggle disallowed ops via stablehlo.custom_calls"; } + void getDependentDialects(DialectRegistry& registry) const final { + registry.insert(); + } void runOnOperation() override { RewritePatternSet patterns(&getContext()); @@ -77,7 +81,7 @@ class SmuggleDisallowedOpsPass patterns.add>(&getContext()); ConversionTarget target(getContext()); - target.addIllegalDialect(); + target.addIllegalOp(); target.addLegalDialect(); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) { From cd0c17316ee2f49238b10d3c63ec9bc6fce72c98 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 18 Mar 2024 17:51:46 -0700 Subject: [PATCH 058/670] [xla:hlo] Do not add processed instructions to DFS stack PiperOrigin-RevId: 617002781 --- third_party/xla/xla/hlo/ir/hlo_computation.cc | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/third_party/xla/xla/hlo/ir/hlo_computation.cc b/third_party/xla/xla/hlo/ir/hlo_computation.cc index 7d8a080bd3840a..d5418d1a9ad47a 100644 --- a/third_party/xla/xla/hlo/ir/hlo_computation.cc +++ b/third_party/xla/xla/hlo/ir/hlo_computation.cc @@ -45,9 +45,11 @@ limitations under the License. #include "xla/printer.h" #include "xla/service/mapped_ptr_container_sorter.h" #include "xla/service/name_uniquer.h" +#include "xla/shape.h" #include "xla/shape_util.h" #include "xla/status_macros.h" #include "xla/util.h" +#include "tsl/lib/gtl/iterator_range.h" #include "tsl/platform/errors.h" #include "tsl/platform/logging.h" #include "tsl/platform/status.h" @@ -510,22 +512,29 @@ void HloComputation::ForEachInstructionPostOrderImpl( bool has_channel_dependencies = !channel_dependencies.empty(); auto* dfs_stack = dfs_stack_scratch; dfs_stack->clear(); - dfs_stack->push_back(root); + + // Pushes instruction to dfs stack only if it was not already processed. 
+ auto dfs_stack_push = [&](HloInstruction* instr) { + VisitState state = visited.GetState(instr->index_in_parent_); + if (state != kVisited) dfs_stack->push_back(instr); + }; + + dfs_stack_push(root); while (!dfs_stack->empty()) { - HloInstruction& current = *dfs_stack->back(); + HloInstruction* current = dfs_stack->back(); + DCHECK_EQ(current->parent(), this) + << "Instruction " << current->name() + << " is not in the current computation (" << name() << ")."; - VisitMap::Handle h = current.index_in_parent_; + VisitMap::Handle h = current->index_in_parent_; VisitState state = visited.GetState(h); if (state == kNew) { visited.SetState(h, kVisiting); } else { dfs_stack->pop_back(); if (state != kVisited) { - DCHECK_EQ(current.parent(), this) - << "Instruction " << current.name() - << " is not in the current computation (" << name() << ")."; - func(¤t); visited.SetState(h, kVisited); + func(current); } continue; } @@ -534,34 +543,22 @@ void HloComputation::ForEachInstructionPostOrderImpl( // Collectives with the same channel ID must be performed together, as these // represent MPMD-partitioned that will later be split into separate modules // and the order must be preserved. - if (has_channel_dependencies && ¤t != root) { - auto it = channel_dependencies.find(¤t); + if (has_channel_dependencies && current != root) { + auto it = channel_dependencies.find(current); if (it != channel_dependencies.end()) { - dfs_stack->insert(dfs_stack->end(), it->second.begin(), - it->second.end()); + absl::c_for_each(it->second, dfs_stack_push); } } // Add the operands to the stack in reverse order so the first operand is // processed first. This will produce a more natural ordering and a nicer // result for things like HLO stringification. - const HloInstruction::InstructionVector& operands = current.operands(); - - for (auto it = operands.rbegin(); it != operands.rend(); ++it) { - HloInstruction* operand = *it; - if (visited.GetState(operand->index_in_parent_) != kVisited) { - dfs_stack->push_back(operand); - } else { - // Already fully visited, so we avoid pushing onto the stack - } - } + const HloInstruction::InstructionVector& operands = current->operands(); + absl::c_for_each(tsl::gtl::make_range(operands.rbegin(), operands.rend()), + dfs_stack_push); - const PtrVec& predecessors = - current.control_predecessors(); - if (!predecessors.empty()) { - dfs_stack->insert(dfs_stack->end(), predecessors.begin(), - predecessors.end()); - } + // Add control predecessors to the stack. + absl::c_for_each(current->control_predecessors(), dfs_stack_push); } } From 9c7585f17c4e8dc497c14c35b02ad3285d7a65be Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 18 Mar 2024 18:11:01 -0700 Subject: [PATCH 059/670] [xla:hlo] NFC: Convert VisitState to enum class PiperOrigin-RevId: 617007972 --- third_party/xla/xla/hlo/ir/hlo_computation.cc | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/third_party/xla/xla/hlo/ir/hlo_computation.cc b/third_party/xla/xla/hlo/ir/hlo_computation.cc index d5418d1a9ad47a..449a86db314cc3 100644 --- a/third_party/xla/xla/hlo/ir/hlo_computation.cc +++ b/third_party/xla/xla/hlo/ir/hlo_computation.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include #include #include +#include #include #include #include @@ -58,7 +59,22 @@ namespace xla { using absl::StrCat; -enum VisitState { kNew = 0, kVisiting = 1, kVisited = 2 }; +enum class VisitState { kNew = 0, kVisiting = 1, kVisited = 2 }; + +static std::ostream& operator<<(std::ostream& os, const VisitState& state) { + switch (state) { + case VisitState::kNew: + os << "new"; + break; + case VisitState::kVisiting: + os << "visiting"; + break; + case VisitState::kVisited: + os << "visited"; + break; + } + return os; +} class HloComputation::VisitMap { public: @@ -516,7 +532,7 @@ void HloComputation::ForEachInstructionPostOrderImpl( // Pushes instruction to dfs stack only if it was not already processed. auto dfs_stack_push = [&](HloInstruction* instr) { VisitState state = visited.GetState(instr->index_in_parent_); - if (state != kVisited) dfs_stack->push_back(instr); + if (state != VisitState::kVisited) dfs_stack->push_back(instr); }; dfs_stack_push(root); @@ -528,12 +544,12 @@ void HloComputation::ForEachInstructionPostOrderImpl( VisitMap::Handle h = current->index_in_parent_; VisitState state = visited.GetState(h); - if (state == kNew) { - visited.SetState(h, kVisiting); + if (state == VisitState::kNew) { + visited.SetState(h, VisitState::kVisiting); } else { dfs_stack->pop_back(); - if (state != kVisited) { - visited.SetState(h, kVisited); + if (state != VisitState::kVisited) { + visited.SetState(h, VisitState::kVisited); func(current); } continue; @@ -1568,16 +1584,16 @@ std::unique_ptr HloComputation::CloneInContext( auto it = visited.find(cur); if (it != visited.end()) { dfs_stack.pop_back(); - if (it->second == kVisited) { + if (it->second == VisitState::kVisited) { continue; } - CHECK_EQ(it->second, kVisiting); + CHECK_EQ(it->second, VisitState::kVisiting); postorder.push_back(cur); - it->second = kVisited; + it->second = VisitState::kVisited; continue; } - visited.insert({cur, kVisiting}); + visited.insert({cur, VisitState::kVisiting}); for (HloInstruction* operand : cur->operands()) { const HloInstruction* new_operand = replace(operand); if (new_operand) { From 44b161ac851da17cbd3efd9f5d6b58d8a96cd1bc Mon Sep 17 00:00:00 2001 From: Dan Suh Date: Mon, 18 Mar 2024 19:09:20 -0700 Subject: [PATCH 060/670] Move the logic for populating default calibration options to `PopulateDefaults`. This change populates calibration method as part of `PopulateDefaults`. This is reused for ODML use cases. However, for ODML the value for `unpack_quantized_types` is explicitly set to `False` because ODML use cases require uniform quantized types to be left intact. 
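A minimal sketch of how the new defaulting is expected to behave, assuming the PopulateDefaults API shown in the cc/config.cc and cc/config_test.cc hunks below; all proto and function names are taken from this patch, and the Example() wrapper is purely illustrative:

    // Sketch only: mirrors the behavior exercised by config_test.cc in this patch.
    #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h"

    namespace stablehlo::quantization {

    void Example() {
      QuantizationConfig config;  // user leaves calibration_options unset
      const QuantizationConfig populated = PopulateDefaults(config);
      // Simple min-max calibration is filled in by default:
      //   populated.calibration_options().calibration_method()
      //       == CalibrationOptions::CALIBRATION_METHOD_MIN_MAX

      // An explicitly provided method is not overridden:
      config.mutable_calibration_options()->set_calibration_method(
          CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX);
      //   PopulateDefaults(config).calibration_options().calibration_method()
      //       == CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX
    }

    }  // namespace stablehlo::quantization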
PiperOrigin-RevId: 617019651 --- .../mlir/lite/quantization/stablehlo/BUILD | 1 + .../quantization/stablehlo/quantization.cc | 12 +++---- .../quantization/stablehlo/quantization.h | 2 +- .../mlir/quantization/stablehlo/cc/config.cc | 16 ++++++++++ .../quantization/stablehlo/cc/config_test.cc | 31 +++++++++++++++++++ .../stablehlo/cc/static_range_ptq.cc | 6 +--- .../stablehlo/cc/static_range_ptq.h | 13 +------- .../stablehlo/quantization_config.proto | 2 +- tensorflow/lite/python/lite.py | 6 +++- 9 files changed, 63 insertions(+), 26 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD index df286611f3e356..f469cbc8fddacf 100644 --- a/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD @@ -18,6 +18,7 @@ cc_library( "//tensorflow/compiler/mlir/lite/stablehlo:tf_stablehlo", "//tensorflow/compiler/mlir/quantization/stablehlo:passes", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:config", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:static_range_ptq", "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc index ccba41d07e103b..0cc946a23d4e25 100644 --- a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc +++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/constants.h" #include "tensorflow/cc/saved_model/loader.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" @@ -41,6 +42,7 @@ namespace tensorflow { namespace { using ::mlir::quant::stablehlo::StaticRangePtqComponent; +using ::stablehlo::quantization::PopulateDefaults; using ::stablehlo::quantization::QuantizationConfig; using ::tensorflow::SignatureDef; using ::tensorflow::quantization::PyFunctionLibrary; @@ -79,7 +81,7 @@ absl::StatusOr RunQuantization( const SavedModelBundle* saved_model_bundle, const absl::string_view saved_model_dir, const std::unordered_set& saved_model_tags, - QuantizationConfig& quantization_config, + const QuantizationConfig& quantization_config, const PyFunctionLibrary* quantization_py_function_lib, mlir::ModuleOp module_op) { if (saved_model_bundle == nullptr) { @@ -94,10 +96,8 @@ absl::StatusOr RunQuantization( "be nullptr."); } - if (!quantization_config.has_calibration_options()) { - *quantization_config.mutable_calibration_options() = - mlir::quant::stablehlo::GetDefaultCalibrationOptions(); - } + const QuantizationConfig config_with_defaults = + PopulateDefaults(quantization_config); const absl::flat_hash_map signature_def_map = GetSignatureDefMapFromBundle(*saved_model_bundle); @@ -132,7 +132,7 @@ absl::StatusOr RunQuantization( /*signature_keys=*/exported_names, saved_model_tags, signature_def_map, GetFunctionAliases(*saved_model_bundle)); const 
absl::StatusOr quantized_module_op = - static_range_ptq_component.Run(module_op, quantization_config); + static_range_ptq_component.Run(module_op, config_with_defaults); if (!quantized_module_op.ok()) { return absl::InternalError("Failed to run quantization. Status msg: " + quantized_module_op.status().ToString()); diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h index ef6496315e8e61..c55d59cad0f1a0 100644 --- a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h +++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h @@ -50,7 +50,7 @@ absl::StatusOr RunQuantization( const SavedModelBundle* saved_model_bundle, absl::string_view saved_model_dir, const std::unordered_set& saved_model_tags, - stablehlo::quantization::QuantizationConfig& quantization_config, + const stablehlo::quantization::QuantizationConfig& quantization_config, const tensorflow::quantization::PyFunctionLibrary* quantization_py_function_lib, mlir::ModuleOp module_op); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc index 679e1f8754be9b..e8a4aa87bb0619 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc @@ -15,11 +15,27 @@ limitations under the License. #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" namespace stablehlo::quantization { +namespace { + +// Creates `CalibrationOptions` with default fields. Uses simple min-max +// calibration by default. +CalibrationOptions GetDefaultCalibrationOptions() { + CalibrationOptions options{}; + options.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_MIN_MAX); + return options; +} + +} // namespace QuantizationConfig PopulateDefaults( const QuantizationConfig& user_provided_config) { QuantizationConfig config = user_provided_config; + if (!config.has_calibration_options()) { + *config.mutable_calibration_options() = GetDefaultCalibrationOptions(); + } + PipelineConfig& pipeline_config = *config.mutable_pipeline_config(); if (!pipeline_config.has_unpack_quantized_types()) { pipeline_config.set_unpack_quantized_types(true); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc index 5912788bddf96b..164cd6bae237f8 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc @@ -14,12 +14,15 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" +#include #include #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" namespace stablehlo::quantization { namespace { +using ::testing::Eq; + TEST(PopulateDefaultsTest, PopulateDefaultsForEmptyConfig) { QuantizationConfig config{}; @@ -37,5 +40,33 @@ TEST(PopulateDefaultsTest, PopulateDefaultsForConfigWithUnpackQuantizedTypes) { EXPECT_FALSE(new_config.pipeline_config().unpack_quantized_types()); } +TEST(PopulateDefaultsTest, DefaultCalibrationOptionsPopulated) { + QuantizationConfig config{}; + + const QuantizationConfig new_config = PopulateDefaults(config); + EXPECT_THAT(new_config.calibration_options().calibration_method(), + Eq(CalibrationOptions::CALIBRATION_METHOD_MIN_MAX)); +} + +TEST(PopulateDefaultsTest, ExplicitCalibrationOptionsNotOverridden) { + QuantizationConfig config{}; + CalibrationOptions& calibration_options = + *config.mutable_calibration_options(); + calibration_options.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX); + calibration_options.mutable_calibration_parameters()->set_initial_num_bins( + 512); + + // Test that if the user explicitly provided `calibration_options`, it is not + // overridden. + const QuantizationConfig new_config = PopulateDefaults(config); + EXPECT_THAT(new_config.calibration_options().calibration_method(), + Eq(CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX)); + EXPECT_THAT(new_config.calibration_options() + .calibration_parameters() + .initial_num_bins(), + Eq(512)); +} + } // namespace } // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc index eaafdf1770f7f9..e4b3595ae0f2de 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc @@ -243,17 +243,13 @@ absl::StatusOr StaticRangePtqComponent::Run( absl::Status QuantizeStaticRangePtq( const absl::string_view src_saved_model_path, const absl::string_view dst_saved_model_path, - QuantizationConfig quantization_config, + const QuantizationConfig& quantization_config, const std::vector& signature_keys, const absl::flat_hash_map& signature_def_map, const PyFunctionLibrary& py_function_library) { std::unordered_set tags; tags.insert(quantization_config.tf_saved_model().tags().begin(), quantization_config.tf_saved_model().tags().end()); - if (!quantization_config.has_calibration_options()) { - *quantization_config.mutable_calibration_options() = - GetDefaultCalibrationOptions(); - } std::unique_ptr ctx = CreateMlirContextForQuantization(); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h index e5056418bbae55..69bd9da6733c0c 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h @@ -37,17 +37,6 @@ limitations under the License. namespace mlir::quant::stablehlo { -using ::stablehlo::quantization::CalibrationOptions; - -// Create default configuration for the calibration step, which is the min/max -// calibration method. 
-inline CalibrationOptions GetDefaultCalibrationOptions() { - CalibrationOptions options{}; - options.set_calibration_method( - CalibrationOptions::CALIBRATION_METHOD_MIN_MAX); - return options; -} - // Component for static-range post-training quantization (PTQ). // TODO: b/320607042 - Add tests in python level. class StaticRangePtqComponent : public Component { @@ -102,7 +91,7 @@ class StaticRangePtqComponent : public Component { absl::Status QuantizeStaticRangePtq( absl::string_view src_saved_model_path, absl::string_view dst_saved_model_path, - ::stablehlo::quantization::QuantizationConfig quantization_config, + const ::stablehlo::quantization::QuantizationConfig& quantization_config, const std::vector& signature_keys, const absl::flat_hash_map& signature_def_map, diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto index 56645d7f3d73ad..b4c4dbdf1f26c8 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto +++ b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto @@ -226,7 +226,7 @@ message CalibrationOptions { } // Determines how to calibrate. - // The default calibration method is MIN_MAX. + // Default value: CALIBRATION_METHOD_MIN_MAX CalibrationMethod calibration_method = 1; // Defines the parameters required for calibration. Parameters such as the diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index c6804b0f35ed18..952392dcb8df84 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -864,7 +864,11 @@ def _get_base_converter_args(self): ) ], enable_per_channel_quantized_weight=True, - ) + ), + # For ODML use cases, uniform quantized types should be left intact. + pipeline_config=qc.PipelineConfig( + unpack_quantized_types=False, + ), ) args["quantization_config"] = quantization_config From bfee9197ad47168d103a6963fdaf8c8a8f27cfa1 Mon Sep 17 00:00:00 2001 From: Anlun Xu Date: Mon, 18 Mar 2024 19:33:33 -0700 Subject: [PATCH 061/670] [xla:cpu] Remove lmhlo dependency PiperOrigin-RevId: 617023917 --- .../xla/mlir/backends/cpu/transforms/BUILD | 1 - .../cpu/transforms/xla_cpu_to_cpu_runtime.cc | 139 +----------------- 2 files changed, 4 insertions(+), 136 deletions(-) diff --git a/third_party/xla/xla/mlir/backends/cpu/transforms/BUILD b/third_party/xla/xla/mlir/backends/cpu/transforms/BUILD index 9fb60b8442c698..ff81d082104b3a 100644 --- a/third_party/xla/xla/mlir/backends/cpu/transforms/BUILD +++ b/third_party/xla/xla/mlir/backends/cpu/transforms/BUILD @@ -44,7 +44,6 @@ cc_library( "//xla/mlir/runtime/utils:custom_calls", "//xla/mlir/xla_cpu/ir:xla_cpu", "//xla/mlir_hlo", - "//xla/mlir_hlo:lhlo", "//xla/service:hlo_parser", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", diff --git a/third_party/xla/xla/mlir/backends/cpu/transforms/xla_cpu_to_cpu_runtime.cc b/third_party/xla/xla/mlir/backends/cpu/transforms/xla_cpu_to_cpu_runtime.cc index 16223cead19f7c..fb3bb71548c6f9 100644 --- a/third_party/xla/xla/mlir/backends/cpu/transforms/xla_cpu_to_cpu_runtime.cc +++ b/third_party/xla/xla/mlir/backends/cpu/transforms/xla_cpu_to_cpu_runtime.cc @@ -35,7 +35,6 @@ limitations under the License. 
#include "xla/mlir/runtime/transforms/type_converter.h" #include "xla/mlir/runtime/utils/custom_calls.h" #include "xla/mlir/xla_cpu/ir/xla_cpu.h" -#include "xla/mlir_hlo/lhlo/IR/lhlo_ops.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" #include "xla/service/hlo_parser.h" @@ -48,8 +47,6 @@ namespace { using namespace mlir; // NOLINT -using mlir::lmhlo::CustomCallOp; - using xla_cpu::PartitionIdOp; using xla_cpu::ReplicaIdOp; @@ -115,133 +112,6 @@ func::CallOp CreateCallForDpsCollectiveOp(Operation* op, //===----------------------------------------------------------------------===// -class CustomCallOpLowering : public OpRewritePattern { - private: - static constexpr const char kCustomCallTarget[] = "xla.cpu.custom_call"; - - public: - CustomCallOpLowering(MLIRContext* ctx, CustomCallDeclarations& custom_calls) - : OpRewritePattern(ctx), custom_calls_(custom_calls) {} - - // Rewrite custom call with `API_VERSION_TYPED_FFI` version into XLA runtime - // custom calls bypassing custom call adaptor. - LogicalResult rewriteTypedCustomCall(CustomCallOp op, - PatternRewriter& rewriter) const { - // TODO(ezhulenev): Support target arg mapping, or explain why we do not - // need them for typed custom calls. - if (op.getTargetArgMapping()) - return op.emitOpError( - "API_VERSION_TYPED_FFI custom calls do not " - "support target arg mapping"); - - // Create a custom call function declaration. - ImplicitLocOpBuilder b(op.getLoc(), rewriter); - func::FuncOp callee = - custom_calls_.GetOrCreate(b, op.getCallTargetName(), op); - callee->setAttr("rt.dynamic", UnitAttr::get(b.getContext())); - - // Forward backend config to the custom call implementation. - auto config = op.getBackendConfig(); - if (!config) return op.emitOpError("Failed to get backend config"); - auto dict = config->cast(); - llvm::SmallVector backend_config(dict.begin(), dict.end()); - - // Call the custom call function forwarding user-defined attributes. - auto call = rewriter.replaceOpWithNewOp( - op, callee.getName(), TypeRange(), op.getOperands()); - AppendCustomCallAttrs(call, backend_config); - - return success(); - } - - LogicalResult matchAndRewrite(CustomCallOp op, - PatternRewriter& rewriter) const override { - // Typed custom calls lowered directly to XLA runtime custom calls. - if (op.getApiVersion() == mhlo::CustomCallApiVersion::API_VERSION_TYPED_FFI) - return rewriteTypedCustomCall(op, rewriter); - - ImplicitLocOpBuilder b(op.getLoc(), rewriter); - - // By default all operands passed to the custom call handler. - llvm::SmallVector operands = op.getOperands(); - - // Get the number of outputs from operand_segment_sizes. - int64_t num_results = op->getAttrOfType( - op.getOperandSegmentSizesAttrName())[1]; - - // If custom call has target arguments mapping, then we need to pass empty - // memrefs in place of holes. - if (op.getTargetArgMapping().has_value()) { - auto mapping = *op.getTargetArgMapping(); - int64_t num_args = mapping.getNumArgs(); - num_results = mapping.getNumResults(); - - // Always create an `alloca` in the parent function entry block. - // See: https://llvm.org/docs/Frontend/PerformanceTips.html#use-of-allocas - Value hole = [&]() -> Value { - OpBuilder::InsertionGuard guard(b); - b.setInsertionPointToStart( - &op->getParentOfType().front()); - return b.create(MemRefType::get({0}, b.getI8Type())); - }(); - - // We represent holes as empty i8 memrefs. - operands = llvm::SmallVector(num_args + num_results, hole); - - // Update operands to mapped custom call arguments. 
- auto args = mapping.getArgsToTargetArgs(); - for (const auto& indexed : llvm::enumerate(args)) - operands[indexed.value()] = op.getArgs()[indexed.index()]; - - // Update operands to mapped custom call results. - auto res = mapping.getResultsToTargetResults(); - for (const auto& indexed : llvm::enumerate(res)) - operands[num_args + indexed.value()] = op.getOutput()[indexed.index()]; - } - - // TODO(jreiffers): This will break if an output has a non-default layout. - operands = EnsureFlatMemrefs(operands, b); - // Create a custom call function declaration. - func::FuncOp callee = custom_calls_.GetOrCreate( - b, kCustomCallTarget, TypeRange(ValueRange(operands)), TypeRange()); - - // The ABI is different depending on whether the original op was outputting - // a tuple or not. For multiple outputs this is trivial but for a single - // output we rely on the xla_shape attribute to distinguish the ABIs. - bool output_tuple = num_results > 1; - if (auto xla_shape = op->getAttrOfType("xla_shape")) - output_tuple = ParseShape(xla_shape.strref())->IsTuple(); - - // This is not equivalent to op.getApiVersionAttr() - that call returns null - // if the attribute is absent. getApiVersion returns the default. - Attribute api_version = - mhlo::CustomCallApiVersionAttr::get(getContext(), op.getApiVersion()); - llvm::SmallVector custom_call_attrs = { - {b.getStringAttr("num_results"), - b.getI32IntegerAttr(static_cast(num_results))}, - {b.getStringAttr("output_tuple"), b.getBoolAttr(output_tuple)}, - {b.getStringAttr("api_version"), api_version}, - {b.getStringAttr("call_target_name"), op.getCallTargetNameAttr()}}; - - if (auto backend_config = op.getBackendConfigAttr()) { - custom_call_attrs.emplace_back(b.getStringAttr("backend_config"), - op.getBackendConfigAttr()); - } - - // Call the runtime intrinsic with the original operands. - auto call = rewriter.replaceOpWithNewOp( - op, callee.getName(), TypeRange(), operands); - AppendCustomCallAttrs(call, custom_call_attrs); - - return success(); - } - - private: - CustomCallDeclarations& custom_calls_; -}; - -//===----------------------------------------------------------------------===// - template class IdOpLowering : public OpRewritePattern { public: @@ -542,11 +412,10 @@ void ConvertXlaCpuToCpuRuntimePass::runOnOperation() { // Convert xla_cpu operations to XLA cpu runtime custom calls. 
RewritePatternSet patterns(ctx); - patterns - .insert( - ctx, custom_calls); + patterns.insert( + ctx, custom_calls); patterns.insert>(ctx, "xla.cpu.partition_id", custom_calls); patterns.insert>(ctx, "xla.cpu.replica_id", From d2f259e64f4035c7beee80af12edcb7eb4f282ef Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 18 Mar 2024 19:45:00 -0700 Subject: [PATCH 062/670] [xla:ffi] Add XLA_FFI_Handler_Traits to capture properties of an FFI handler PiperOrigin-RevId: 617026090 --- third_party/xla/xla/ffi/api/api.h | 30 ++++++++++--------- third_party/xla/xla/ffi/api/c_api.h | 12 +++++++- third_party/xla/xla/ffi/ffi_api.cc | 20 +++++++------ third_party/xla/xla/ffi/ffi_api.h | 13 +++++--- third_party/xla/xla/ffi/ffi_test.cc | 6 +++- .../xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc | 4 +-- third_party/xla/xla/python/xla_compiler.cc | 4 +-- .../address_computation_fusion_rewriter.cc | 4 +-- .../xla/xla/service/gpu/fusions/custom.cc | 11 +++---- .../xla/service/gpu/ir_emitter_unnested.cc | 6 ++-- .../runtime/address_computation_thunk_test.cc | 12 ++++---- 11 files changed, 73 insertions(+), 49 deletions(-) diff --git a/third_party/xla/xla/ffi/api/api.h b/third_party/xla/xla/ffi/api/api.h index bc21d1856a8f85..7faddec1e350d4 100644 --- a/third_party/xla/xla/ffi/api/api.h +++ b/third_party/xla/xla/ffi/api/api.h @@ -107,10 +107,9 @@ class Ffi { // Registers handler with an XLA runtime under the given name on a given // platform. - static inline XLA_FFI_Error* RegisterStaticHandler(const XLA_FFI_Api* api, - std::string_view name, - std::string_view platform, - XLA_FFI_Handler* handler); + static inline XLA_FFI_Error* RegisterStaticHandler( + const XLA_FFI_Api* api, std::string_view name, std::string_view platform, + XLA_FFI_Handler* handler, XLA_FFI_Handler_Traits traits = 0); protected: template @@ -131,7 +130,8 @@ class Ffi { XLA_FFI_Error* Ffi::RegisterStaticHandler(const XLA_FFI_Api* api, std::string_view name, std::string_view platform, - XLA_FFI_Handler* handler) { + XLA_FFI_Handler* handler, + XLA_FFI_Handler_Traits traits) { // Make copies of string views to guarantee they are null terminated. std::string name_str(name); std::string platform_str(platform); @@ -142,6 +142,7 @@ XLA_FFI_Error* Ffi::RegisterStaticHandler(const XLA_FFI_Api* api, args.name = name_str.c_str(); args.platform = platform_str.c_str(); args.handler = handler; + args.traits = traits; return api->XLA_FFI_Handler_Register(&args); } @@ -1294,15 +1295,16 @@ auto DictionaryDecoder(Members... m) { // TODO(ezhulenev): Add a callback so that end users can log registration error // to appropriate logging destination, e.g. LOG(FATAL) for duplicate internal // FFI handlers. -#define XLA_FFI_REGISTER_HANDLER(API, NAME, PLATFORM, FUNC) \ - XLA_FFI_REGISTER_HANDLER_(API, NAME, PLATFORM, FUNC, __COUNTER__) -#define XLA_FFI_REGISTER_HANDLER_(API, NAME, PLATFORM, FUNC, N) \ - XLA_FFI_REGISTER_HANDLER__(API, NAME, PLATFORM, FUNC, N) -#define XLA_FFI_REGISTER_HANDLER__(API, NAME, PLATFORM, FUNC, N) \ - XLA_FFI_ATTRIBUTE_UNUSED static const XLA_FFI_Error* \ - xla_ffi_static_handler_##N##_registered_ = [] { \ - return ::xla::ffi::Ffi::RegisterStaticHandler(API, NAME, PLATFORM, \ - FUNC); \ +#define XLA_FFI_REGISTER_HANDLER(API, NAME, PLATFORM, FUNC, ...) \ + XLA_FFI_REGISTER_HANDLER_(API, NAME, PLATFORM, FUNC, __COUNTER__, \ + ##__VA_ARGS__) +#define XLA_FFI_REGISTER_HANDLER_(API, NAME, PLATFORM, FUNC, N, ...) 
\ + XLA_FFI_REGISTER_HANDLER__(API, NAME, PLATFORM, FUNC, N, ##__VA_ARGS__) +#define XLA_FFI_REGISTER_HANDLER__(API, NAME, PLATFORM, FUNC, N, ...) \ + XLA_FFI_ATTRIBUTE_UNUSED static const XLA_FFI_Error* \ + xla_ffi_static_handler_##N##_registered_ = [] { \ + return ::xla::ffi::Ffi::RegisterStaticHandler(API, NAME, PLATFORM, \ + FUNC, ##__VA_ARGS__); \ }() } // namespace xla::ffi diff --git a/third_party/xla/xla/ffi/api/c_api.h b/third_party/xla/xla/ffi/api/c_api.h index 114b2b4f6fbf1a..5549c5f3c2a30d 100644 --- a/third_party/xla/xla/ffi/api/c_api.h +++ b/third_party/xla/xla/ffi/api/c_api.h @@ -267,6 +267,15 @@ XLA_FFI_DEFINE_STRUCT_TRAITS(XLA_FFI_CallFrame, attrs); // External functions registered with XLA as FFI handlers. typedef XLA_FFI_Error* XLA_FFI_Handler(XLA_FFI_CallFrame* call_frame); +enum XLA_FFI_Handler_TraitsBits { + // Calls to FFI handler are safe to trace into the command buffer. It means + // that calls to FFI handler always launch exactly the same device operations + // (can depend on attribute values) that can be captured and then replayed. + XLA_FFI_HANDLER_TRAITS_COMMAND_BUFFER_COMPATIBLE = 1u << 0, +}; + +typedef uint32_t XLA_FFI_Handler_Traits; + struct XLA_FFI_Handler_Register_Args { size_t struct_size; void* priv; @@ -274,9 +283,10 @@ struct XLA_FFI_Handler_Register_Args { const char* name; // null terminated const char* platform; // null terminated XLA_FFI_Handler* handler; + XLA_FFI_Handler_Traits traits; }; -XLA_FFI_DEFINE_STRUCT_TRAITS(XLA_FFI_Handler_Register_Args, handler); +XLA_FFI_DEFINE_STRUCT_TRAITS(XLA_FFI_Handler_Register_Args, traits); typedef XLA_FFI_Error* XLA_FFI_Handler_Register( XLA_FFI_Handler_Register_Args* args); diff --git a/third_party/xla/xla/ffi/ffi_api.cc b/third_party/xla/xla/ffi/ffi_api.cc index 3173157a10ba90..75de43e277cfc1 100644 --- a/third_party/xla/xla/ffi/ffi_api.cc +++ b/third_party/xla/xla/ffi/ffi_api.cc @@ -84,7 +84,7 @@ Status Call(XLA_FFI_Handler* handler, CallFrame& call_frame, //===----------------------------------------------------------------------===// using HandlerKey = std::pair; -using HandlerRegistry = absl::flat_hash_map; +using HandlerRegistry = absl::flat_hash_map; static HandlerKey MakeHandlerKey(std::string_view name, std::string_view platform) { @@ -97,9 +97,10 @@ static HandlerRegistry& GetHandlerRegistry() { } static Status RegisterHandler(std::string_view name, std::string_view platform, - XLA_FFI_Handler* handler) { - auto emplaced = - GetHandlerRegistry().try_emplace(MakeHandlerKey(name, platform), handler); + XLA_FFI_Handler* handler, + XLA_FFI_Handler_Traits traits) { + auto emplaced = GetHandlerRegistry().try_emplace( + MakeHandlerKey(name, platform), HandlerRegistration{handler, traits}); if (!emplaced.second) return absl::InvalidArgumentError( absl::StrCat("Duplicate FFI handler registration for ", name, @@ -107,8 +108,8 @@ static Status RegisterHandler(std::string_view name, std::string_view platform, return OkStatus(); } -absl::StatusOr FindHandler(std::string_view name, - std::string_view platform) { +absl::StatusOr FindHandler(std::string_view name, + std::string_view platform) { auto it = GetHandlerRegistry().find(MakeHandlerKey(name, platform)); if (it == GetHandlerRegistry().end()) return absl::NotFoundError(absl::StrCat("No FFI handler registered for ", @@ -116,9 +117,9 @@ absl::StatusOr FindHandler(std::string_view name, return it->second; } -absl::flat_hash_map StaticRegisteredHandlers( +absl::flat_hash_map StaticRegisteredHandlers( std::string_view platform) { - absl::flat_hash_map calls; 
+ absl::flat_hash_map calls; for (const auto& [metadata, handler] : GetHandlerRegistry()) { if (absl::AsciiStrToLower(platform) == metadata.second) { calls[metadata.first] = handler; @@ -236,7 +237,8 @@ static XLA_FFI_Error* XLA_FFI_Handler_Register( "XLA_FFI_Handler_Register", XLA_FFI_Handler_Register_Args_STRUCT_SIZE, args->struct_size)); - if (auto status = RegisterHandler(args->name, args->platform, args->handler); + if (auto status = RegisterHandler(args->name, args->platform, args->handler, + args->traits); !status.ok()) { return new XLA_FFI_Error{std::move(status)}; } diff --git a/third_party/xla/xla/ffi/ffi_api.h b/third_party/xla/xla/ffi/ffi_api.h index eae9eeda0a34c3..d101a8974587b6 100644 --- a/third_party/xla/xla/ffi/ffi_api.h +++ b/third_party/xla/xla/ffi/ffi_api.h @@ -27,7 +27,6 @@ limitations under the License. #include "xla/hlo/ir/hlo_computation.h" #include "xla/service/service_executable_run_options.h" #include "xla/status.h" -#include "xla/statusor.h" namespace xla::ffi { @@ -62,12 +61,18 @@ Status Call(XLA_FFI_Handler* handler, CallFrame& call_frame, // XLA FFI registry //===----------------------------------------------------------------------===// +struct HandlerRegistration { + XLA_FFI_Handler* handler = nullptr; + XLA_FFI_Handler_Traits traits = 0; +}; + // Returns registered FFI handler for a given name and platform, or an error if // it's not found in the static registry. -absl::StatusOr FindHandler(std::string_view name, - std::string_view platform); +absl::StatusOr FindHandler(std::string_view name, + std::string_view platform); + // Returns all registered calls in the static registry for a given platform. -absl::flat_hash_map StaticRegisteredHandlers( +absl::flat_hash_map StaticRegisteredHandlers( std::string_view platform); //===----------------------------------------------------------------------===// diff --git a/third_party/xla/xla/ffi/ffi_test.cc b/third_party/xla/xla/ffi/ffi_test.cc index c28da195f7ba57..7c4e5fe1e083fb 100644 --- a/third_party/xla/xla/ffi/ffi_test.cc +++ b/third_party/xla/xla/ffi/ffi_test.cc @@ -49,7 +49,8 @@ TEST(FfiTest, StaticRegistration) { XLA_FFI_DEFINE_HANDLER(NoOp1, noop); XLA_FFI_REGISTER_HANDLER(GetXlaFfiApi(), "no-op-0", "Host", NoOp0); - XLA_FFI_REGISTER_HANDLER(GetXlaFfiApi(), "no-op-1", "Host", NoOp1); + XLA_FFI_REGISTER_HANDLER(GetXlaFfiApi(), "no-op-1", "Host", NoOp1, + XLA_FFI_HANDLER_TRAITS_COMMAND_BUFFER_COMPATIBLE); auto handler0 = FindHandler("no-op-0", "Host"); auto handler1 = FindHandler("no-op-1", "Host"); @@ -57,6 +58,9 @@ TEST(FfiTest, StaticRegistration) { TF_ASSERT_OK(handler0.status()); TF_ASSERT_OK(handler1.status()); + ASSERT_EQ(handler0->traits, 0); + ASSERT_EQ(handler1->traits, XLA_FFI_HANDLER_TRAITS_COMMAND_BUFFER_COMPATIBLE); + EXPECT_THAT(StaticRegisteredHandlers("Host"), UnorderedElementsAre(Pair("no-op-0", _), Pair("no-op-1", _))); } diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc index 6d753c5dd4c117..f5583b3878dd12 100644 --- a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc +++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc @@ -430,8 +430,8 @@ TEST(PjrtCApiGpuExtensionTest, CustomCallTyped) { reinterpret_cast(next)->custom_call(&args); CHECK_EQ(error, nullptr); - auto* custom_call = xla::ffi::FindHandler(function_name, "CUDA").value(); - EXPECT_EQ(reinterpret_cast(custom_call), kNoop); + auto registration = xla::ffi::FindHandler(function_name, "CUDA").value(); + EXPECT_EQ(reinterpret_cast(registration.handler), kNoop); } } 
// namespace diff --git a/third_party/xla/xla/python/xla_compiler.cc b/third_party/xla/xla/python/xla_compiler.cc index 69bb526a05683d..1ef547777794ae 100644 --- a/third_party/xla/xla/python/xla_compiler.cc +++ b/third_party/xla/xla/python/xla_compiler.cc @@ -943,10 +943,10 @@ void BuildXlaCompilerSubmodule(nb::module_& m) { targets[nb::str(name.data(), name.size())] = nb::capsule(target); } - for (const auto& [name, target] : + for (const auto& [name, registration] : ffi::StaticRegisteredHandlers(platform)) { targets[nb::str(name.data(), name.size())] = - nb::capsule(reinterpret_cast(target)); + nb::capsule(reinterpret_cast(registration.handler)); } return targets; }, diff --git a/third_party/xla/xla/service/gpu/address_computation_fusion_rewriter.cc b/third_party/xla/xla/service/gpu/address_computation_fusion_rewriter.cc index ebee4e06f65d6d..ad124ed3eabde2 100644 --- a/third_party/xla/xla/service/gpu/address_computation_fusion_rewriter.cc +++ b/third_party/xla/xla/service/gpu/address_computation_fusion_rewriter.cc @@ -80,12 +80,12 @@ bool IsCustomCall(const HloInstruction* hlo, absl::string_view platform_name) { void* call_target = CustomCallTargetRegistry::Global()->Lookup( call_target_name, std::string(platform_name)); - absl::StatusOr handler = + absl::StatusOr handler_registration = ffi::FindHandler(call_target_name, platform_name); // At least one implementation should be available at run time. bool found_custom_call = !is_ffi_custom_call && call_target != nullptr; - bool found_ffi_handler = is_ffi_custom_call && handler.ok(); + bool found_ffi_handler = is_ffi_custom_call && handler_registration.ok(); return found_custom_call || found_ffi_handler; } diff --git a/third_party/xla/xla/service/gpu/fusions/custom.cc b/third_party/xla/xla/service/gpu/fusions/custom.cc index 8027bd69756a3d..619fe2281611d7 100644 --- a/third_party/xla/xla/service/gpu/fusions/custom.cc +++ b/third_party/xla/xla/service/gpu/fusions/custom.cc @@ -187,7 +187,7 @@ absl::StatusOr EmitCustomCall( const BufferAssignment& buffer_assignment = ir_emitter_context.buffer_assignment(); - const std::string call_target_name = custom_call.custom_call_target(); + const std::string& call_target_name = custom_call.custom_call_target(); // Typed FFI custom calls is a replacement for legacy custom calls with // a rich type safe API. It's under construction and not fully supported. @@ -197,12 +197,12 @@ absl::StatusOr EmitCustomCall( void* call_target = CustomCallTargetRegistry::Global()->Lookup( call_target_name, std::string(ir_emitter_context.platform_name())); - absl::StatusOr handler = + absl::StatusOr registration = ffi::FindHandler(call_target_name, ir_emitter_context.platform_name()); // At least one implementation should be available at run time. bool found_custom_call = !is_ffi_custom_call && call_target != nullptr; - bool found_ffi_handler = is_ffi_custom_call && handler.ok(); + bool found_ffi_handler = is_ffi_custom_call && registration.ok(); if (!found_custom_call && !found_ffi_handler) { return absl::InternalError( @@ -323,8 +323,9 @@ absl::StatusOr EmitCustomCall( auto ffi_thunk = [&] { auto& called_computations = custom_call.called_computations(); return std::make_unique( - Thunk::ThunkInfo::WithProfileAnnotation(&custom_call), *handler, - std::move(operands), std::move(results), std::move(attributes), + Thunk::ThunkInfo::WithProfileAnnotation(&custom_call), + registration->handler, std::move(operands), std::move(results), + std::move(attributes), called_computations.empty() ? 
nullptr : called_computations[0]); }; diff --git a/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc b/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc index 79eca88e8f96ea..09c0631925406b 100644 --- a/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc +++ b/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc @@ -1315,12 +1315,12 @@ absl::Status IrEmitterUnnested::EmitCustomCallThunk( void* call_target = CustomCallTargetRegistry::Global()->Lookup( call_target_name, std::string(platform_name())); - absl::StatusOr handler = + absl::StatusOr registration = ffi::FindHandler(call_target_name, platform_name()); // At least one implementation should be available at run time. bool found_custom_call = !is_ffi_custom_call && call_target != nullptr; - bool found_ffi_handler = is_ffi_custom_call && handler.ok(); + bool found_ffi_handler = is_ffi_custom_call && registration.ok(); if (!found_custom_call && !found_ffi_handler) { auto& debug_options = ir_emitter_context_->debug_options(); @@ -1452,7 +1452,7 @@ absl::Status IrEmitterUnnested::EmitCustomCallThunk( auto ffi_thunk = [&] { auto& called_computations = instr->called_computations(); return std::make_unique( - Thunk::ThunkInfo::WithProfileAnnotation(instr), *handler, + Thunk::ThunkInfo::WithProfileAnnotation(instr), registration->handler, std::move(operands), std::move(results), std::move(attributes), called_computations.empty() ? nullptr : called_computations[0]); }; diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc index 1167cf18a93c57..dc57a6447922e4 100644 --- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc +++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc @@ -560,8 +560,8 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) { // Preparing custom call thunk: setting up call target and operands + results // buffers. - auto handler = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM); - ASSERT_TRUE(handler.ok()); + auto registration = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM); + ASSERT_TRUE(registration.ok()); std::vector> operands{ CustomCallThunk::Slice{slice_src_fake, @@ -573,7 +573,7 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) { // Creating embedded custom call thunk. ThunkSequence seq; seq.emplace_back(std::make_unique( - Thunk::ThunkInfo(nullptr), *handler, operands, results, + Thunk::ThunkInfo(nullptr), registration->handler, operands, results, /*attributes=*/CustomCallThunk::AttributesMap(), /*called_computation=*/nullptr)); @@ -713,8 +713,8 @@ TEST(AddressComputationThunkTest, SlicedOutputMemcpy) { // Preparing custom call thunk: setting up call target and operands + results // buffers. - auto handler = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM); - ASSERT_TRUE(handler.ok()); + auto registration = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM); + ASSERT_TRUE(registration.ok()); std::vector> operands{ CustomCallThunk::Slice{slice_src_fake, @@ -726,7 +726,7 @@ TEST(AddressComputationThunkTest, SlicedOutputMemcpy) { // Creating embedded custom call thunk. 
ThunkSequence seq; seq.emplace_back(std::make_unique( - Thunk::ThunkInfo(nullptr), *handler, operands, results, + Thunk::ThunkInfo(nullptr), registration->handler, operands, results, /*attributes=*/CustomCallThunk::AttributesMap(), /*called_computation=*/nullptr)); From a644925a6a62fb08e1d984553bf81bc9daff4a4e Mon Sep 17 00:00:00 2001 From: Reed Wanderman-Milne Date: Mon, 18 Mar 2024 20:15:27 -0700 Subject: [PATCH 063/670] Handle element_size_in_bits in constant folding. Constant folding runs HloEvaluator, which creates Literals. Before, if element_size_in_bits was nonzero in a constant folded op, an error would occur since Literals do not support element_size_in_bits and so CHECKed it was zero in the constructor. Now Literal will silently set element_size_in_bits to zero in the constructor. Because the newly created constant-folded constant op derives its Shape from the constant-folded literal, now HloConstantFolding explicitly sets element_size_in_bits on the newly created constant op since the Literal will always have element_size_in_bits set to zero. This will be needed to support int4 in arbitrary ops on CPUs/GPUs. PiperOrigin-RevId: 617033705 --- third_party/xla/xla/literal.cc | 24 +++++++++++------ third_party/xla/xla/literal.h | 5 ++++ .../xla/xla/service/hlo_constant_folding.cc | 10 +++++++ .../xla/service/hlo_constant_folding_test.cc | 26 +++++++++++++++++++ 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/third_party/xla/xla/literal.cc b/third_party/xla/xla/literal.cc index 7c1ae28e8a9d58..d5364cb848e652 100644 --- a/third_party/xla/xla/literal.cc +++ b/third_party/xla/xla/literal.cc @@ -249,6 +249,21 @@ Literal::Literal() : Literal(NilShape()) {} Literal::Literal(const Shape& shape) : Literal(shape, /*allocate_arrays=*/true) {} +void Literal::SetShape(const Shape& shape) { + Shape shape_storage; + const Shape* shape_ptr = &shape; + if (LayoutUtil::HasCustomElementSizeInBits(shape)) { + shape_storage = shape; + shape_storage.mutable_layout()->set_element_size_in_bits(0); + shape_ptr = &shape_storage; + } + if (const Shape* intered_shape_ptr = TryInternShape(*shape_ptr)) { + shape_ = intered_shape_ptr; + } else { + shape_ = std::make_unique(*shape_ptr); + } +} + void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays, ArrayValueState leaf_array_value_state) { if (shape.IsTuple()) { @@ -276,16 +291,9 @@ void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays, Literal::Literal(const Shape& shape, bool allocate_arrays, ArrayValueState leaf_array_value_state) : MutableLiteralBase() { - if (const Shape* intered_shape_ptr = TryInternShape(shape)) { - shape_ = intered_shape_ptr; - } else { - shape_ = std::make_unique(shape); - } + SetShape(shape); CHECK(leaf_array_value_state != ArrayValueState::kKnown || LayoutUtil::HasLayout(*shape_)); - // Currently we do nibble packing/unpacking in TPU host/device transfer. - CHECK(!LayoutUtil::HasCustomElementSizeInBits(*shape_)) - << "Literal does not support layouts with custom bit size: " << *shape_; root_piece_.set_subshape(shape_.get()); CHECK(&root_piece_.subshape() == shape_.get()); diff --git a/third_party/xla/xla/literal.h b/third_party/xla/xla/literal.h index 8f8894dbc26ea8..a6b4758cf64234 100644 --- a/third_party/xla/xla/literal.h +++ b/third_party/xla/xla/literal.h @@ -1469,6 +1469,11 @@ class Literal : public MutableLiteralBase { // Deallocate the buffers held by this literal. void DeallocateBuffers(); + // Sets the shape_ field from a Shape. 
shape_'s element_size_in_bits field + // on the layout is always set to 0 since Literals do not support packed + // subbyte elements. + void SetShape(const Shape& shape); + // Recursively sets the subshapes and buffers of all subpieces rooted at // 'piece'. If 'allocate_array' is true, memory is allocated for the arrays in // the shape. diff --git a/third_party/xla/xla/service/hlo_constant_folding.cc b/third_party/xla/xla/service/hlo_constant_folding.cc index 71f58d9a241232..7afdb75649edc3 100644 --- a/third_party/xla/xla/service/hlo_constant_folding.cc +++ b/third_party/xla/xla/service/hlo_constant_folding.cc @@ -233,6 +233,16 @@ StatusOr HloConstantFolding::Run( dead_instructions.push_back(instruction); HloInstruction* new_constant = computation->AddInstruction( HloInstruction::CreateConstant(std::move(result))); + if (new_constant->shape().has_layout()) { + // Update element_size_in_bits on the new instruction's layout. Literals + // always have element_size_in_bits set to 0, and CreateConstant copies + // the shape/layout from the Literal, so we need to set + // element_size_in_bits here. + new_constant->mutable_shape() + ->mutable_layout() + ->set_element_size_in_bits( + instruction->shape().layout().element_size_in_bits()); + } TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(new_constant)); } } diff --git a/third_party/xla/xla/service/hlo_constant_folding_test.cc b/third_party/xla/xla/service/hlo_constant_folding_test.cc index 4150b24ead5ee1..4958bee65f54d1 100644 --- a/third_party/xla/xla/service/hlo_constant_folding_test.cc +++ b/third_party/xla/xla/service/hlo_constant_folding_test.cc @@ -346,6 +346,32 @@ TEST_F(HloConstantFoldingTest, FoldOpsWhereOneOperandIsBroadcast) { ))); } +TEST_F(HloConstantFoldingTest, FoldInt4Ops) { + const char* const kModuleStr = R"( + HloModule test + + ENTRY entry { + c0 = s4[2]{0:E(4)} constant({1, 2}) + c1 = s4[2]{0:E(4)} constant({3, 4}) + add1 = s4[2]{0:E(4)} add(c0, c1) + c2 = s4[]{:E(4)} constant(5) + add2 = s4[2]{0:E(4)} add(c0, s4[2]{0:E(4)} broadcast(c2)) + ROOT root = tuple(add1, add2) + })"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(kModuleStr)); + HloConstantFolding constant_folding; + TF_ASSERT_OK_AND_ASSIGN(bool result, + RunHloPass(&constant_folding, module.get())); + EXPECT_TRUE(result); + auto is_4_bit = [](const HloInstruction* instr) { + return instr->shape().layout().element_size_in_bits() == 4; + }; + EXPECT_THAT(module->entry_computation()->root_instruction(), + GmockMatch(m::Tuple(m::Constant().WithPredicate(is_4_bit), + m::Constant().WithPredicate(is_4_bit)))); +} + TEST_F(HloConstantFoldingTest, BigReduceWindow) { constexpr absl::string_view kModuleStr = R"( HloModule test From 9c1b61a664d66b15d07c25e713d8d96f6bf347c8 Mon Sep 17 00:00:00 2001 From: Wilsin Gosti Date: Mon, 18 Mar 2024 20:36:42 -0700 Subject: [PATCH 064/670] #tf-data Set the iterator prefix and `DebugString` of `GlobalShuffleDataset` to `GlobalShuffle` to be consistent with other datasets. 
PiperOrigin-RevId: 617037057 --- .../kernels/data/experimental/global_shuffle_dataset_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/global_shuffle_dataset_op.cc b/tensorflow/core/kernels/data/experimental/global_shuffle_dataset_op.cc index e0cbd047bc945b..ad0006724bd5ef 100644 --- a/tensorflow/core/kernels/data/experimental/global_shuffle_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/global_shuffle_dataset_op.cc @@ -47,6 +47,7 @@ namespace { constexpr int32_t kIndexShuffleRounds = 8; +constexpr const char kDatasetType[] = "GlobalShuffle"; constexpr const char kElementCount[] = "element_count"; constexpr const char kGlobalShuffleDataset[] = "GlobalShuffleDataset"; constexpr const char kReshuffleEachIteration[] = "reshuffle_each_iteration"; @@ -105,7 +106,7 @@ class GlobalShuffleDatasetOp::Dataset : public DatasetBase { } std::string DebugString() const override { - return name_utils::DatasetDebugString(kGlobalShuffleDataset); + return name_utils::DatasetDebugString(kDatasetType); } int64_t CardinalityInternal(CardinalityOptions options) const override { @@ -340,8 +341,7 @@ std::unique_ptr GlobalShuffleDatasetOp::Dataset::MakeIteratorInternal( const std::string& prefix) const { return std::make_unique( - Iterator::Params{ - this, name_utils::IteratorPrefix(kGlobalShuffleDataset, prefix)}, + Iterator::Params{this, name_utils::IteratorPrefix(kDatasetType, prefix)}, seed_generator_->get()); } From a66b17075a05337532a1758cfe81c4f11abe3029 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 Mar 2024 21:05:42 -0700 Subject: [PATCH 065/670] When possible, use the device ids provided by the user instead of defaulting to the iota order. PiperOrigin-RevId: 617041894 --- .../auto_sharding/auto_sharding.cc | 13 +++++-- .../auto_sharding/auto_sharding_strategy.cc | 27 ++++++++++++-- .../auto_sharding/auto_sharding_test.cc | 37 +++++++++++++++++++ .../auto_sharding/cluster_environment.h | 11 ++++++ 4 files changed, 81 insertions(+), 7 deletions(-) diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc index dc7a4eb01edf79..a59f4ee2f335b3 100644 --- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc +++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc @@ -3631,9 +3631,16 @@ absl::StatusOr AutoShardingImplementation::RunAutoSharding( return changed.status(); } } - std::vector device_mesh_ids = std::vector(total_devices); - std::iota(device_mesh_ids.begin(), device_mesh_ids.end(), 0); - device_mesh.SetValues(device_mesh_ids); + if (option_.device_mesh_ids.size() == total_devices) { + // It is unclear what device order to use for partial meshes. So we only + // use the actual device order only for the final full mesh. + device_mesh.SetValues(option_.device_mesh_ids); + } else { + std::vector device_mesh_ids = + std::vector(total_devices); + std::iota(device_mesh_ids.begin(), device_mesh_ids.end(), 0); + device_mesh.SetValues(device_mesh_ids); + } // TODO (zhuohan): Include the prof result as an option. 
spmd::ProfilingResult prof_result; diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_strategy.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_strategy.cc index 8dfe3877e3b7b2..4563141e30b67f 100644 --- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_strategy.cc +++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_strategy.cc @@ -393,13 +393,32 @@ BuildStrategyAndCost(const HloInstructionSequence& sequence, // Find output shardings. switch (opcode) { case HloOpcode::kSlice: { + // When solve_nd_sharding_iteratively is true, in some cases, we + // can have 1D shardings where the total number of tiles is larger + // than the number of elements in the partial mesh (and is + // actually equal to the number of devices in the original + // mesh). Below, we use the correct mesh depending on the number + // of elements in the 1D sharding. bool is_1d_sharding = VectorGreaterThanOneElementCount( input_spec.tile_assignment().dimensions()) == 1; - output_spec = PropagateDimwiseShardingSlice( - input_spec, operand->shape(), ins->shape(), - is_1d_sharding ? cluster_env.device_mesh_1d_ - : cluster_env.device_mesh_); + if (is_1d_sharding && + input_spec.TotalNumTiles() == + cluster_env.device_mesh_1d_.num_elements()) { + output_spec = PropagateDimwiseShardingSlice( + input_spec, operand->shape(), ins->shape(), + cluster_env.device_mesh_1d_); + } else if (is_1d_sharding) { + CHECK_EQ(input_spec.TotalNumTiles(), + cluster_env.original_device_mesh_1d_.num_elements()); + output_spec = PropagateDimwiseShardingSlice( + input_spec, operand->shape(), ins->shape(), + cluster_env.original_device_mesh_1d_); + } else { + output_spec = PropagateDimwiseShardingSlice( + input_spec, operand->shape(), ins->shape(), + cluster_env.device_mesh_); + } break; } case HloOpcode::kPad: diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_test.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_test.cc index aa1167dfac33bb..27b9df98e3a88c 100644 --- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_test.cc +++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_test.cc @@ -269,6 +269,43 @@ ENTRY %elementwise { op::Sharding("{devices=[2,1,2]0,2,1,3 last_tile_dim_replicate}"))); } +TEST_F(AutoShardingTest, SliceMixedUserShardingTest) { + constexpr absl::string_view hlo_string = R"( +HloModule module + +ENTRY %elementwise { + param = s32[512,3084]{1,0} parameter(0), sharding={devices=[4,1]0,2,1,3} + slice = s32[512,2048]{1,0} slice(param), slice={[0:512], [0:2048]} + ROOT copy = s32[512,2048]{1,0} copy(slice) +})"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, + AutoSharding( + /* option */ { + .enable = true, + .preserve_shardings = + AutoShardingOption::PreserveShardingsType::kKeepAllShardings, + .solve_nd_sharding_iteratively = true, + .device_mesh_shape = {2, 2}, + .device_mesh_ids = {0, 2, 1, 3}, + .device_mesh_alpha = {1.0, 1.0}, + .device_mesh_beta = {0.01, 1.0}}) + .Run(module.get())); + VLOG(10) << module->ToString(); + EXPECT_TRUE(changed); + + std::vector instructions = + module->entry_computation()->MakeInstructionPostOrder(); + EXPECT_THAT(instructions, + Each(ResultOf( + [](const HloInstruction* ins) { return ins->has_sharding(); }, + IsTrue()))); + EXPECT_THAT(instructions, Each(op::Sharding("{devices=[4,1]0,2,1,3}"))); +} + TEST_F(AutoShardingTest, 
RngBitGeneratorArrayInput) { constexpr absl::string_view hlo_string = R"( HloModule rng_bit_generator diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/cluster_environment.h b/third_party/xla/xla/hlo/experimental/auto_sharding/cluster_environment.h index 7bab542bdbd2e3..19736d19e25f0a 100644 --- a/third_party/xla/xla/hlo/experimental/auto_sharding/cluster_environment.h +++ b/third_party/xla/xla/hlo/experimental/auto_sharding/cluster_environment.h @@ -51,6 +51,7 @@ class ClusterEnvironment { prof_result_(prof_result), total_devices_(device_mesh.num_elements()), device_mesh_1d_(device_mesh), + original_device_mesh_1d_(original_device_mesh), auto_sharding_option_(auto_sharding_option) { // Build replica group for each dimension. non_zero_mesh_dims_ = @@ -71,6 +72,12 @@ class ClusterEnvironment { std::vector device_mesh_1d_shape(device_mesh.num_dimensions(), 1); device_mesh_1d_shape[largest_dim_idx] = device_mesh.num_elements(); device_mesh_1d_.Reshape(device_mesh_1d_shape); + + std::vector original_device_mesh_1d_shape( + original_device_mesh.num_dimensions(), 1); + original_device_mesh_1d_shape[largest_dim_idx] = + original_device_mesh.num_elements(); + original_device_mesh_1d_.Reshape(original_device_mesh_1d_shape); } size_t NumDevices() const { return total_devices_; } @@ -171,6 +178,10 @@ class ClusterEnvironment { // Used for mixed mesh shape strategies. Array device_mesh_1d_; + // Cache a flatten 1d version of the original device mesh. + // Used for mixed mesh shape strategies. + Array original_device_mesh_1d_; + // The option may override the cost of communication primitives const AutoShardingOption& auto_sharding_option_; From 165e3288ce537138b6f02f9a71dd23466960df85 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Mon, 18 Mar 2024 21:09:55 -0700 Subject: [PATCH 066/670] [OptimizeFunctionGraph] Prune the function library to reachable functions in the post-optimized graph. Adds an overload `FunctionLibraryDefinition::ReachableDefinitions(const Graph&)` that enables capturing the reachable definitions from a `tensorflow::Graph` (and not just a protobuf-based graph). PiperOrigin-RevId: 617042757 --- .../optimize_function_graph_utils.cc | 7 +- tensorflow/core/framework/function.cc | 90 +++++++++++++------ tensorflow/core/framework/function.h | 1 + 3 files changed, 70 insertions(+), 28 deletions(-) diff --git a/tensorflow/core/common_runtime/optimize_function_graph_utils.cc b/tensorflow/core/common_runtime/optimize_function_graph_utils.cc index 357520c827c393..264067a10a73d5 100644 --- a/tensorflow/core/common_runtime/optimize_function_graph_utils.cc +++ b/tensorflow/core/common_runtime/optimize_function_graph_utils.cc @@ -41,6 +41,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/placer.h" #include "tensorflow/core/common_runtime/replicate_per_replica_nodes.h" #include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/metrics.h" #include "tensorflow/core/framework/optimized_function_graph.pb.h" @@ -640,8 +641,12 @@ StatusOr OptimizeFunctionGraph( graph->mutable_flib_def()->set_default_registry(nullptr); graph->mutable_flib_def()->Clear(); + + FunctionLibraryDefinition pruned_lib_def = + reachable_lib_def.ReachableDefinitions(*graph); + return OptimizedFunctionGraphInfo( - function_name, std::move(graph), std::move(reachable_lib_def), + function_name, std::move(graph), std::move(pruned_lib_def), node_name_to_control_ret, ret_types, ret_nodes.size(), env->NowMicros() - graph_optimization_start_time_usecs, optimization_source); diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc index 5a63dc61f019ac..0b6bacd94af0d9 100644 --- a/tensorflow/core/framework/function.cc +++ b/tensorflow/core/framework/function.cc @@ -1820,9 +1820,12 @@ namespace { constexpr char kApiImplements[] = "api_implements"; -std::set ReachableFunctions( - const FunctionLibraryDefinition& flib, - const protobuf::RepeatedPtrField& nodes) { +template +std::set ReachableFunctions(const FunctionLibraryDefinition& flib, + NodeIter begin, NodeIter end, + OpTypeGetter op_type_getter, + AttrGetter attr_getter) { // Functions that are reachable from the graph. std::set reachable_funcs; @@ -1860,31 +1863,33 @@ std::set ReachableFunctions( } }; + const auto process_attr_value = [&](const AttrValue& attr_value) { + // 1. AttrValue.func + if (attr_value.has_func()) { + add_to_func_queue(attr_value.func().name()); + } + + // 2. AttrValue.ListValue.func + if (attr_value.has_list()) { + for (const auto& func : attr_value.list().func()) { + add_to_func_queue(func.name()); + } + } + }; + // Add all the functions that are reachable from the given node to the queue. - const auto process_node = [&](const NodeDef& node) { + const auto process_node = [&](NodeType node) { // Node itself can be a call to the function. - add_to_func_queue(node.op()); + add_to_func_queue(op_type_getter(node)); // Or node can have an attribute referencing a function. - for (const auto& attr : node.attr()) { - const auto& attr_value = attr.second; - - // 1. AttrValue.func - if (attr_value.has_func()) { - add_to_func_queue(attr_value.func().name()); - } - - // 2. AttrValue.ListValue.func - if (attr_value.has_list()) { - for (const auto& func : attr_value.list().func()) { - add_to_func_queue(func.name()); - } - } + for (const auto& attr : attr_getter(node)) { + process_attr_value(attr.second); } }; // Add all functions that are directly called from the optimized graph. - std::for_each(nodes.begin(), nodes.end(), process_node); + std::for_each(begin, end, process_node); // Process all reachable functions. while (!func_queue.empty()) { @@ -1901,7 +1906,18 @@ std::set ReachableFunctions( // Find all the functions called from the function body. const auto& func_body = func->fdef().node_def(); - std::for_each(func_body.begin(), func_body.end(), process_node); + + const auto process_node_def = [&](const NodeDef node) { + // Node itself can be a call to the function. + add_to_func_queue(node.op()); + + // Or node can have an attribute referencing a function. 
+ for (const auto& attr : node.attr()) { + process_attr_value(attr.second); + } + }; + + std::for_each(func_body.begin(), func_body.end(), process_node_def); // Check if the function has a registered gradient. const string grad_func_name = flib.FindGradient(func_name); @@ -1911,10 +1927,13 @@ std::set ReachableFunctions( return reachable_funcs; } +template FunctionLibraryDefinition ReachableFunctionLibraryDefinition( - const FunctionLibraryDefinition& flib, - const protobuf::RepeatedPtrField& nodes) { - std::set reachable_funcs = ReachableFunctions(flib, nodes); + const FunctionLibraryDefinition& flib, NodeIter begin, NodeIter end, + OpTypeGetter op_type_getter, AttrGetter attr_getter) { + std::set reachable_funcs = ReachableFunctions( + flib, begin, end, op_type_getter, attr_getter); FunctionLibraryDefinition reachable_flib(flib.default_registry(), FunctionDefLibrary()); @@ -1961,12 +1980,26 @@ const char* IsSet(void* ptr) { return ptr == nullptr ? "unset" : "set"; } FunctionLibraryDefinition FunctionLibraryDefinition::ReachableDefinitions( const GraphDef& graph) const { - return ReachableFunctionLibraryDefinition(*this, graph.node()); + return ReachableFunctionLibraryDefinition( + *this, graph.node().begin(), graph.node().end(), + [](const NodeDef& ndef) { return ndef.op(); }, + [](const NodeDef& ndef) { return ndef.attr(); }); } FunctionLibraryDefinition FunctionLibraryDefinition::ReachableDefinitions( const FunctionDef& func) const { - return ReachableFunctionLibraryDefinition(*this, func.node_def()); + return ReachableFunctionLibraryDefinition( + *this, func.node_def().begin(), func.node_def().end(), + [](const NodeDef& ndef) { return ndef.op(); }, + [](const NodeDef& ndef) { return ndef.attr(); }); +} + +FunctionLibraryDefinition FunctionLibraryDefinition::ReachableDefinitions( + const Graph& graph) const { + return ReachableFunctionLibraryDefinition( + *this, graph.nodes().begin(), graph.nodes().end(), + [](const Node* node) { return node->type_string(); }, + [](const Node* node) { return node->attrs(); }); } absl::StatusOr @@ -1975,7 +2008,10 @@ FunctionLibraryDefinition::ReachableDefinitions( auto* func = Find(function_name); if (func) { FunctionLibraryDefinition ret = - ReachableFunctionLibraryDefinition(*this, func->node_def()); + ReachableFunctionLibraryDefinition( + *this, func->node_def().begin(), func->node_def().end(), + [](const NodeDef& ndef) { return ndef.op(); }, + [](const NodeDef& ndef) { return ndef.attr(); }); TF_RETURN_IF_ERROR(ret.CopyFunctionDefFrom(function_name, *this)); return ret; } else { diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h index af956ac1524427..eb74ea58905405 100644 --- a/tensorflow/core/framework/function.h +++ b/tensorflow/core/framework/function.h @@ -569,6 +569,7 @@ class FunctionLibraryDefinition : public OpRegistryInterface { // reachable from the nodes of `graph` or `func`. FunctionLibraryDefinition ReachableDefinitions(const GraphDef& graph) const; FunctionLibraryDefinition ReachableDefinitions(const FunctionDef& func) const; + FunctionLibraryDefinition ReachableDefinitions(const Graph& graph) const; absl::StatusOr ReachableDefinitions( const std::string& function_name) const; From a74f5d1d9238d696d4347d28bfee70e45b5dc78c Mon Sep 17 00:00:00 2001 From: Dan Suh Date: Mon, 18 Mar 2024 21:44:15 -0700 Subject: [PATCH 067/670] Implement expanding presets from `QuantizationConfig`. `ExpandPresets` transfers quantization presets and populates other fields in `QuantizationConfig`. 
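A rough sketch of the expansion, for illustration only (the expansion itself is implemented by the C++ `ExpandPresets` added below; the Python proto import path here is an assumption, not part of this change):

```
# Sketch only: the proto module path below is assumed for illustration.
from tensorflow.compiler.mlir.quantization.stablehlo import (
    quantization_config_pb2 as qc_pb2,
)

config = qc_pb2.QuantizationConfig()
dataset = config.static_range_ptq_preset.representative_datasets.add()
dataset.tf_record.path = "/test/path"

# ExpandPresets(config) rewrites the preset into explicit fields:
#
#   specs {
#     specs {
#       matcher { function_name { regex: ".*" } }
#       method { static_range_ptq {} }
#     }
#   }
#   calibration_options {
#     representative_datasets { tf_record { path: "/test/path" } }
#   }
#
# User-provided entries in `specs` are appended after the expanded spec and
# therefore take precedence for the quantizable units they match.
```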
PiperOrigin-RevId: 617049405 --- .../mlir/quantization/stablehlo/cc/config.cc | 46 +++++++++ .../mlir/quantization/stablehlo/cc/config.h | 17 ++++ .../quantization/stablehlo/cc/config_test.cc | 98 +++++++++++++++++++ .../stablehlo/quantization_config.proto | 35 +++++-- 4 files changed, 189 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc index e8a4aa87bb0619..0284c00523f420 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + namespace stablehlo::quantization { namespace { @@ -23,11 +25,55 @@ CalibrationOptions GetDefaultCalibrationOptions() { CalibrationOptions options{}; options.set_calibration_method( CalibrationOptions::CALIBRATION_METHOD_MIN_MAX); + return options; } +void ExpandStaticRangePtqPreset(const StaticRangePtqPreset& preset, + QuantizationConfig& config) { + // Populate with preset's representative dataset configs if the user didn't + // explicitly specify other representative dataset configs to the top-level + // `CalibrationOptions`. + if (config.calibration_options().representative_datasets().empty()) { + auto preset_datasets = preset.representative_datasets(); + config.mutable_calibration_options() + ->mutable_representative_datasets() + ->Add(preset_datasets.begin(), preset_datasets.end()); + } + + // Create a new `QuantizationSpecs` to replace the existing one. The expansion + // from `StaticRangePtqPreset` gets populated first and then user-provided + // explicit `QuantizationSpec`s will be appended. + QuantizationSpecs new_specs{}; + QuantizationSpec& spec = *new_specs.add_specs(); + spec.mutable_matcher()->mutable_function_name()->set_regex(".*"); + spec.mutable_method()->mutable_static_range_ptq(); + + const QuantizationSpecs& previous_specs = config.specs(); + new_specs.mutable_specs()->Add(previous_specs.specs().begin(), + previous_specs.specs().end()); + + config.mutable_specs()->Swap(&new_specs); +} + } // namespace +QuantizationConfig ExpandPresets(const QuantizationConfig& config) { + QuantizationConfig new_config = config; + + // Update the `new_config` with each preset's expansions. + switch (config.preset_case()) { + case QuantizationConfig::kStaticRangePtqPreset: + ExpandStaticRangePtqPreset(config.static_range_ptq_preset(), new_config); + break; + default: + // Preset has not been specified. The expansion is a no-op. + break; + } + + return new_config; +} + QuantizationConfig PopulateDefaults( const QuantizationConfig& user_provided_config) { QuantizationConfig config = user_provided_config; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h index 20b9efa4a60fa0..5dc4554d784c92 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h @@ -24,6 +24,23 @@ namespace stablehlo::quantization { QuantizationConfig PopulateDefaults( const QuantizationConfig& user_provided_config); +// Returns a copy of `QuantizationConfig` where presets are expanded and +// transformed into other fields in `QuantizationConfig`. 
+// +// The expansion rules are as follows: +// * StaticRangePtqPreset +// - The preset's `representative_datasets` field will be transferred to +// `QuantizationConfig.calibration_options.representative_datasets`, unless +// the user explicitly provided representative dataset configs to +// `calibration_options`. In that case, the explicit configs take precedence +// and the preset's configs are ignored. +// - For `QuantizationSpecs`, the expanded `QuantizationSpec`s will be +// populated first and user-provided `QuantizationSpec`s, if any, will be +// appended. This expresses the fact that user-provided specs take precedence. +// * Preset unspecified +// - No-op. +QuantizationConfig ExpandPresets(const QuantizationConfig& config); + } // namespace stablehlo::quantization #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONFIG_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc index 164cd6bae237f8..b606c797819c4b 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc @@ -22,6 +22,8 @@ namespace stablehlo::quantization { namespace { using ::testing::Eq; +using ::testing::SizeIs; +using ::testing::StrEq; TEST(PopulateDefaultsTest, PopulateDefaultsForEmptyConfig) { QuantizationConfig config{}; @@ -68,5 +70,101 @@ TEST(PopulateDefaultsTest, ExplicitCalibrationOptionsNotOverridden) { Eq(512)); } +TEST(ExpandPresetsTest, ExpandUnspecifiedPreset) { + QuantizationConfig config{}; + const QuantizationConfig new_config = ExpandPresets(config); + + // Test that nothing has been changed. + EXPECT_FALSE(new_config.has_specs()); + EXPECT_FALSE(new_config.has_calibration_options()); + EXPECT_FALSE(new_config.has_pipeline_config()); +} + +TEST(ExpandPresetsTest, ExpandStaticRangePtqPreset) { + QuantizationConfig config{}; + RepresentativeDatasetConfig& preset_dataset_config = + *config.mutable_static_range_ptq_preset()->add_representative_datasets(); + preset_dataset_config.mutable_tf_record()->set_path("/test/path"); + + const QuantizationConfig new_config = ExpandPresets(config); + ASSERT_THAT(new_config.specs().specs(), SizeIs(1)); + + const QuantizationSpec& spec = new_config.specs().specs(0); + EXPECT_THAT(spec.matcher().function_name().regex(), StrEq(".*")); + EXPECT_TRUE(spec.method().has_static_range_ptq()); + + // Test that representative dataset config has been transferred to the + // `CalibrationOptions`. + ASSERT_THAT(new_config.calibration_options().representative_datasets(), + SizeIs(1)); + EXPECT_THAT(new_config.calibration_options() + .representative_datasets(0) + .tf_record() + .path(), + StrEq("/test/path")); +} + +TEST(ExpandPresetsTest, + ExpandStaticRangePtqPresetWithExplicitRepresentativeDatasetConfigs) { + // Test the scenario where both + // `config.calibration_options.representative_datasets` and + // `config.static_range_ptq_preset.representative_datasets` are both + // specified. In this case, the one set to the `calibration_options` takes + // precedence. 
+ QuantizationConfig config{}; + RepresentativeDatasetConfig& top_level_dataset_config = + *config.mutable_calibration_options()->add_representative_datasets(); + top_level_dataset_config.mutable_tf_record()->set_path("/test/path/1"); + + RepresentativeDatasetConfig& preset_dataset_config = + *config.mutable_static_range_ptq_preset()->add_representative_datasets(); + preset_dataset_config.mutable_tf_record()->set_path("/test/path/2"); + + const QuantizationConfig new_config = ExpandPresets(config); + + // Test that representative dataset config has not been transferred to the + // `CalibrationOptions`. Top-level config takes precedence. + ASSERT_THAT(new_config.calibration_options().representative_datasets(), + SizeIs(1)); + EXPECT_THAT(new_config.calibration_options() + .representative_datasets(0) + .tf_record() + .path(), + StrEq("/test/path/1")); +} + +TEST(ExpandPresetsTest, + ExpandStaticRangePtqPresetWithExplicitSpecsAppendedAfterExpandedSpecs) { + QuantizationConfig config{}; + config.mutable_static_range_ptq_preset(); + + QuantizationSpec& user_provided_spec = *config.mutable_specs()->add_specs(); + user_provided_spec.mutable_matcher()->mutable_function_name()->set_regex( + "composite_dot_general_fn_1"); + user_provided_spec.mutable_method()->mutable_no_quantization(); + + // Test that the expanded `QuantizationSpec`s are populated first and then + // user-provided specs are appended. + // + // It should look like: + // + // specs {matcher {function_name {regex: ".*"}} method {static_range_ptq {}}} + // specs { + // matcher {function_name {regex: "composite_dot_general_fn_1"}} + // method {no_quantization {}} + // } + const QuantizationConfig new_config = ExpandPresets(config); + ASSERT_THAT(new_config.specs().specs(), SizeIs(2)); + + const QuantizationSpec& first_spec = new_config.specs().specs(0); + EXPECT_THAT(first_spec.matcher().function_name().regex(), StrEq(".*")); + EXPECT_TRUE(first_spec.method().has_static_range_ptq()); + + const QuantizationSpec& second_spec = new_config.specs().specs(1); + EXPECT_THAT(second_spec.matcher().function_name().regex(), + StrEq("composite_dot_general_fn_1")); + EXPECT_TRUE(second_spec.method().has_no_quantization()); +} + } // namespace } // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto index b4c4dbdf1f26c8..36b781a7d28914 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto +++ b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto @@ -28,13 +28,23 @@ message RepresentativeDatasetConfig { } // Preset config for static-range post-training quantization (PTQ). +// // Minimal user input about representative datasets is required. Representative // datasets are required for static-range PTQ to retrieve quantization // statistics via calibration. +// +// This preset is equivalent to the following `QuantizationSpecs`: +// +// ``` +// specs {matcher {function_name {regex: ".*"}} method {static_range_ptq {}}} +// ``` +// // Next ID: 3 message StaticRangePtqPreset { // Configures representative dataset. Each item corresponds to a // representative dataset used to calibrate a function. + // If `QuantizationConfig.calibration_options.representative_datasets` is also + // provided then this field will be ignored. repeated RepresentativeDatasetConfig representative_datasets = 1; // NOTE: This field will be deprecated. 
@@ -93,6 +103,9 @@ message QuantizationResults { // denylisting quantizable units from quantization. message NoQuantization {} +// Configurations for static-range post-training quantization method. +message StaticRangePtq {} + // Represents a matching method that matches quantizable units by lifted // functions' names. message FunctionNameMatcherSpec { @@ -110,7 +123,10 @@ message MatcherSpec { // Specifies how to quantize matched quantizable units. message Method { - NoQuantization no_quantization = 1; + oneof method { + NoQuantization no_quantization = 1; + StaticRangePtq static_range_ptq = 2; + } } // A QuantizationSpec is essentially a (matcher spec, quantization method) pair, @@ -184,9 +200,10 @@ message DebuggerConfig { } // Defines various calibration options. +// Next ID: 4 message CalibrationOptions { // Configurations for calibration methods. - // NEXT ID: 7 + // Next ID: 7 enum CalibrationMethod { CALIBRATION_METHOD_UNSPECIFIED = 0; // Use the min, max values of all sample datasets. @@ -211,7 +228,7 @@ message CalibrationOptions { } // Parameters required for calibration. - // NEXT ID: 4 + // Next ID: 4 message CalibrationParameters { // The number of bins when histogram is initialized. It can be increased // because histogram is dynamically expanded by sample inputs. @@ -234,6 +251,10 @@ message CalibrationOptions { // MIN_MAX and AVERAGE_MIN_MAX don't require this parameter and methods // starting with HISTOGRAM require this parameter. CalibrationParameters calibration_parameters = 2; + + // Configures representative dataset. Each item corresponds to a + // representative dataset used to calibrate a function. + repeated RepresentativeDatasetConfig representative_datasets = 3; } // Quantization configuration for StableHLO Quantizer. This is the primary @@ -242,10 +263,10 @@ message CalibrationOptions { message QuantizationConfig { // Config presets provide predefined popular or common quantization specs. // Lightweight users may choose one of the presets for quick experiments. Each - // preset is completely represented by `QuantizationSpecs`. When extra entries - // in `QuantizationSpecs` are provided along with a preset, then the preset - // will be overridden for the quantizable units matched by those additional - // `QuantizationSpec`s. + // preset is completely represented by other fields in `QuantizationConfig`. + // + // When extra entries in `QuantizationSpecs` are provided along with a preset, + // then those entries will take precedence. oneof preset { // Performs best-effort static-range post-training quantization (PTQ). 
StaticRangePtqPreset static_range_ptq_preset = 1; From ac3c5809b04ad37428d003e489e7651a50eb6b5e Mon Sep 17 00:00:00 2001 From: Deqiang Chen Date: Mon, 18 Mar 2024 21:56:22 -0700 Subject: [PATCH 068/670] Lower tf.IfrtRestoreVariableOp to tf_mlrt.IfrtRestoreVariableOp PiperOrigin-RevId: 617051341 --- .../compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td | 28 ++++++++++++++++++- .../mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir | 21 ++++++++++++++ .../mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc | 23 ++++++++++++++- .../mlir/tfrt/transforms/mlrt/util.cc | 6 ++-- 4 files changed, 73 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td index 7fbc42ad3db93f..72eac197011a6d 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td @@ -427,7 +427,7 @@ def AsyncWhileOp : TensorflowMlrt_Op<"async_while", [Pure]> { }]; } -def IfrtLoadVariableOp: TensorflowMlrt_Op<"ifrt_load_variable", []> { +def IfrtLoadVariableOp: TensorflowMlrt_Op<"ifrt_load_variable", [Pure]> { let summary = "Loads a variable tensor as an IFRT array for mlrt"; let description = [{ @@ -458,5 +458,31 @@ def IfrtLoadVariableOp: TensorflowMlrt_Op<"ifrt_load_variable", []> { ); } +def IfrtRestoreVariableOp: TensorflowMlrt_Op<"ifrt_restore_variable", []> { + let summary = "Restore variable tensors"; + let description = [{ + This is the MLRT version of tf.IfrtRestoreVariableOp. + + This Op is similar to a combination of RestoreV2 and AssignVariable Op, but + this Op's execution is asynchronous. + + This Op is specific to MLRT runtime and is not a stable interface for + serialization. + + This Op will restore the tensors asynchronously and allow the runtime to look + for them. + The runtime shall handle the possibility that the tensors are not ready when requested + because the tensors are loaded asynchronously. 
+ + }]; + + let arguments = (ins + TFTensorType:$prefix, + TFTensorType:$tensor_names, + TFTensorType:$shape_and_slices, + Variadic:$var_handles, + TypeArrayAttr: $restored_dtypes + ); +} #endif diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir index eb2e0587364d6e..4cd2d6f3613a27 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir @@ -476,3 +476,24 @@ func.func @ifrt_load_variable_test() -> () { func.return } +// ----- + +// Test lowering of IfrtRestoreVariableOp + +// CHECK-LABEL: func @ifrt_restore_variable_test +func.func @ifrt_restore_variable_test() -> () { + // CHECK-NEXT: [[PREFIX:%.*]] = tf_mlrt.executeop + %cst = "tf.Const"() {__op_key = 0: i32, value = dense<"restore_ariables"> : tensor} : () -> tensor + // CHECK-NEXT: [[SLICE:%.*]] = tf_mlrt.executeop + %cst_0 = "tf.Const"() {__op_key = 1: i32, value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + // CHECK-NEXT: [[NAME:%.*]] = tf_mlrt.executeop + %cst_1 = "tf.Const"() {__op_key = 2: i32, value = dense<["y"]> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + // CHECK-NEXT: [[HANDLE:%.*]] = tf_mlrt.executeop + %handle = "tf.VarHandleOp"() {__op_key = 3: i32, container = "x", shared_name = "y"} : () -> tensor>> + // CHECK-NEXT: "tf_mlrt.ifrt_restore_variable"([[PREFIX]], [[NAME]], [[SLICE]], [[HANDLE]]) {restored_dtypes = [f32]} + "tf.IfrtRestoreVariableOp"(%cst, %cst_1, %cst_0, %handle) {restored_dtypes = [f32]} : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor>>) -> () + // CHECK-NEXT: return + func.return +} + + diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc index 8271a5c796e5c4..0fb986e567b2f4 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc @@ -343,6 +343,26 @@ class IfrtLoadVariableOpConversion } }; +// Convert tf.IfrtRestoreVariableOp to tf_mlrt.IfrtRestoreVariableOp +class IfrtRestoreVariableOpConversion + : public mlir::OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult matchAndRewrite( + mlir::TF::IfrtRestoreVariableOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + auto new_op = rewriter.create( + op.getLoc(), adaptor.getOperands()[0], adaptor.getOperands()[1], + adaptor.getOperands()[2], + adaptor.getOperands().slice(3, adaptor.getOperands().size() - 3), + op.getRestoredDtypes()); + rewriter.replaceOp(op, new_op); + + return mlir::success(); + } +}; + std::optional DecodeLongName(mlir::Location loc) { if (auto name_loc = loc.dyn_cast()) { return name_loc.getName().str(); @@ -1189,7 +1209,8 @@ class TfToMlrtConversionPass patterns.add(&context, &type_converter_, &symbol_table); patterns.add(&context); + IfrtRestoreVariableOpConversion, TFAwaitOpConversion, + TFPromiseOpConversion>(&context); patterns.add(type_converter_, &context); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc index d9e1b7f73ac0c8..a1f9d401f5c485 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc @@ -35,9 +35,9 @@ bool UseFallback(mlir::Operation *op) { return !llvm::isa< mlir::TF::_TfrtSetResourceOp, 
mlir::TF::_TfrtGetResourceOp, mlir::TF::BatchFunctionOp, mlir::TF::CaseOp, mlir::TF::IfrtLoadVariableOp, - mlir::TF::StatefulPartitionedCallOp, mlir::TF::PartitionedCallOp, - mlir::TF::LegacyCallOp, mlir::TF::IfOp, mlir::TF::WhileOp, - mlir::TF::TPUCompileMlirAndExecuteOp>(op); + mlir::TF::IfrtRestoreVariableOp, mlir::TF::StatefulPartitionedCallOp, + mlir::TF::PartitionedCallOp, mlir::TF::LegacyCallOp, mlir::TF::IfOp, + mlir::TF::WhileOp, mlir::TF::TPUCompileMlirAndExecuteOp>(op); } } // namespace mlrt_compiler From a0911b4c89e9511d0089a37b0d32fb4c8b4795e6 Mon Sep 17 00:00:00 2001 From: Doyoung Gwak Date: Mon, 18 Mar 2024 23:20:46 -0700 Subject: [PATCH 069/670] Migrate DebuggerOptions to DebuggerConfig PiperOrigin-RevId: 617066168 --- RELEASE.md | 2 ++ .../mlir/quantization/stablehlo/cc/debugger.cc | 8 ++++---- .../mlir/quantization/stablehlo/cc/debugger.h | 2 +- .../integration_test/quantize_model_test.py | 5 ++--- .../tensorflow/python/pywrap_quantize_model.cc | 10 +++++----- .../tensorflow/python/quantize_model.cc | 15 +-------------- .../tensorflow/python/quantize_model.py | 12 ++++++------ .../tensorflow/quantization_options.proto | 17 +---------------- .../quantization/tensorflow/quantize_passes.cc | 6 +++--- ...ion.experimental.-quantization-options.pbtxt | 4 ++-- ...ion.experimental.-quantization-options.pbtxt | 4 ++-- 11 files changed, 29 insertions(+), 56 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 6ff074b10e465d..cd4e3a2cc3bdb8 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -29,6 +29,8 @@ * GPU * Support for NVIDIA GPUs with compute capability 8.9 (e.g. L4 & L40) has been added to TF binary distributions (Python wheels). +* Replace `DebuggerOptions` of TensorFlow Quantizer, and migrate to + `DebuggerConfig` of StableHLO Quantizer. ## Keras diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc index 1ba51790de0ac9..134ce2a5a89ebd 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc @@ -30,16 +30,16 @@ limitations under the License. namespace stablehlo::quantization { namespace { +using ::stablehlo::quantization::DebuggerConfig; using ::tensorflow::NodeDef; using ::tensorflow::SignatureDef; -using ::tensorflow::quantization::DebuggerOptions; using ::tensorflow::quantization::ExportedModel; using ::tensorflow::quantization::PyFunctionLibrary; } // namespace void EnableDebugging( - ExportedModel& exported_model, const DebuggerOptions& debugger_options, + ExportedModel& exported_model, const DebuggerConfig& debugger_config, const PyFunctionLibrary& py_function_library, const absl::string_view src_saved_model_path, const std::unordered_set& tags, @@ -52,13 +52,13 @@ void EnableDebugging( } }); - if (debugger_options.debugger_type() == + if (debugger_config.debugger_type() == DebuggerConfig::DEBUGGER_TYPE_WHOLE_MODEL) { // TODO: b/295139417 - Remove CustomAggregator op in unquantized dump model. // TODO: b/296916287 - Create a separate function for saving unquantized // dump model. py_function_library.SaveExportedModel( - debugger_options.unquantized_dump_model_path(), exported_model, + debugger_config.unquantized_dump_model_path(), exported_model, src_saved_model_path, tags, signature_def_map); // Update the `DumpTensor` ops' file name in `graph_def`. 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h index 6bb427ecbdf1fd..4cb1523a7594ee 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h @@ -38,7 +38,7 @@ namespace stablehlo::quantization { // and compare them offline. void EnableDebugging( tensorflow::quantization::ExportedModel& exported_model, - const tensorflow::quantization::DebuggerOptions& debugger_options, + const stablehlo::quantization::DebuggerConfig& debugger_config, const tensorflow::quantization::PyFunctionLibrary& py_function_library, absl::string_view src_saved_model_path, const std::unordered_set& tags, diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py index a28d7ebe4bf7f3..18e5a14de44110 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py @@ -83,7 +83,6 @@ 'UniformQuantizedDotHybrid', ) -_DebuggerOptions = quant_opts_pb2.DebuggerOptions _DebuggerConfig = stablehlo_quant_config_pb2.DebuggerConfig # Lists of ops whose channel dimension should be changed if per_channel @@ -5926,7 +5925,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset: preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8 ), op_set=quant_opts_pb2.XLA, - debugger_options=_DebuggerOptions( + debugger_config=_DebuggerConfig( debugger_type=_DebuggerConfig.DebuggerType.DEBUGGER_TYPE_WHOLE_MODEL, unquantized_dump_model_path=unquantized_dump_model_path, log_dir_path=log_dir_path, @@ -6039,7 +6038,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset: preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8 ), op_set=target_opset, - debugger_options=_DebuggerOptions( + debugger_config=_DebuggerConfig( debugger_type=debugger_type, log_dir_path=log_dir_path, ), diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc index 8273279df67787..d61cb59905d66f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc @@ -89,7 +89,7 @@ PYBIND11_MODULE(pywrap_quantize_model, m) { // Remove the `tpu` tag from the debug quantized saved model as it is // for CPU. Note the 'tpu' value should be the same as `TPU` defined in // tensorflow/python/saved_model/tag_constants.py. - if (quantization_options.has_debugger_options()) { + if (quantization_options.has_debugger_config()) { tags.erase("tpu"); } py_function_library.SaveExportedModel( @@ -138,7 +138,7 @@ PYBIND11_MODULE(pywrap_quantize_model, m) { // Remove the `tpu` tag from the debug quantized saved model as it is // for CPU. Note the 'tpu' value should be the same as `TPU` defined in // tensorflow/python/saved_model/tag_constants.py. 
- if (quantization_options.has_debugger_options()) { + if (quantization_options.has_debugger_config()) { tags.erase("tpu"); } py_function_library.SaveExportedModel( @@ -255,9 +255,9 @@ PYBIND11_MODULE(pywrap_quantize_model, m) { << status; } - if (quantization_options.has_debugger_options()) { + if (quantization_options.has_debugger_config()) { EnableDebugging(*exported_model, - quantization_options.debugger_options(), + quantization_options.debugger_config(), py_function_library, src_saved_model_path, tags, signature_def_map); } @@ -283,7 +283,7 @@ PYBIND11_MODULE(pywrap_quantize_model, m) { // Remove the `tpu` tag from the debug quantized saved model as it is // for CPU. Note the 'tpu' value should be the same as `TPU` defined in // tensorflow/python/saved_model/tag_constants.py. - if (quantization_options.has_debugger_options()) { + if (quantization_options.has_debugger_config()) { tags.erase("tpu"); } py_function_library.SaveExportedModel( diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc index 08b71190bbb5b5..10bedefb55161d 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc @@ -79,18 +79,6 @@ using ::stablehlo::quantization::DebuggerConfig; using ::stablehlo::quantization::QuantizationConfig; using ::stablehlo::quantization::io::GetLocalTmpFileName; -// TODO: b/326355110 - Removes `ConvertDebuggerOptionToDebuggerConfig` when -// merging `DebuggingOption` to `DebuggingConfig`. -DebuggerConfig ConvertDebuggerOptionToDebuggerConfig( - const DebuggerOptions &debugger_options) { - DebuggerConfig debugger_config; - debugger_config.set_debugger_type(debugger_options.debugger_type()); - debugger_config.set_unquantized_dump_model_path( - debugger_options.unquantized_dump_model_path()); - debugger_config.set_log_dir_path(debugger_options.log_dir_path()); - return debugger_config; -} - absl::StatusOr> ImportAndPreprocessSavedModel( absl::string_view saved_model_path, const std::vector &signature_keys, @@ -268,8 +256,7 @@ absl::StatusOr QuantizePtqModelPreCalibration( if (is_stablehlo) { QuantizationConfig quantization_config; *quantization_config.mutable_debugger_config() = - ConvertDebuggerOptionToDebuggerConfig( - quantization_options.debugger_options()); + quantization_options.debugger_config(); PreCalibrationComponent pre_calibration_component(context.get()); TF_ASSIGN_OR_RETURN(*module_ref, pre_calibration_component.Run( *module_ref, quantization_config)); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py index 1bf3fe81c7d8ba..961db5334e3bbe 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py @@ -692,7 +692,7 @@ def _populate_quantization_options_default_values( ' quantization via TF Quantizer.' ) - if quantization_options.HasField('debugger_options'): + if quantization_options.HasField('debugger_config'): # Set `force_graph_mode_calibration` to True to avoid skipping op execution, # which are not connected to return ops, during calibration execution. 
# Setting `force_graph_mode_calibration` to True enables execution of the @@ -704,11 +704,11 @@ def _populate_quantization_options_default_values( ) quantization_options.force_graph_mode_calibration = True - if not quantization_options.debugger_options.log_dir_path: - quantization_options.debugger_options.log_dir_path = '/tmp/dumps' + if not quantization_options.debugger_config.log_dir_path: + quantization_options.debugger_config.log_dir_path = '/tmp/dumps' if ( - quantization_options.debugger_options.debugger_type + quantization_options.debugger_config.debugger_type == stablehlo_quant_config_pb2.DebuggerConfig.DebuggerType.DEBUGGER_TYPE_UNSPECIFIED ): raise ValueError( @@ -716,9 +716,9 @@ def _populate_quantization_options_default_values( ) if ( - quantization_options.debugger_options.debugger_type + quantization_options.debugger_config.debugger_type == stablehlo_quant_config_pb2.DebuggerConfig.DebuggerType.DEBUGGER_TYPE_WHOLE_MODEL - and not quantization_options.debugger_options.unquantized_dump_model_path + and not quantization_options.debugger_config.unquantized_dump_model_path ): raise ValueError( 'Debugger type whole model verify was used but' diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto b/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto index 13d3876500fe0d..d2c79b6ce4c668 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto @@ -145,21 +145,6 @@ message RepresentativeDatasetFile { } } -// Configuration for quantization debugger. -// NEXT ID: 4 -message DebuggerOptions { - // Type of quantization debugger. Depending on the type, inputs and outputs - // are wired differently. - stablehlo.quantization.DebuggerConfig.DebuggerType debugger_type = 1; - - // Path to save unquantized model with dump tensor ops attached. - // Used when debugger_type is WHOLE_MODEL. - string unquantized_dump_model_path = 2; - - // Path to save debugger related logs. Defaults to '/tmp/dumps'. - string log_dir_path = 3; -} - // Defines various options to specify and control the behavior of the quantizer. // It consists of // 1) Model-wise quantization configuration as a default configuration. If it is @@ -251,7 +236,7 @@ message QuantizationOptions { stablehlo.quantization.CalibrationOptions calibration_options = 15; // Configuration related to quantization debugger. 
- DebuggerOptions debugger_options = 16; + stablehlo.quantization.DebuggerConfig debugger_config = 16; reserved 3; } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc index 0d5e43cd6f334e..0e756021844a5c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc @@ -149,10 +149,10 @@ void AddQuantizePtqPreCalibrationPasses( pm.addPass(mlir::quant::CreateLiftQuantizableSpotsAsFunctionsPass( quantization_options)); // TODO: b/295140328 - Add debugger support for weight only - if (quantization_options.has_debugger_options()) { + if (quantization_options.has_debugger_config()) { pm.addPass(mlir::quant::CreateAddDumpTensorOpPass( - quantization_options.debugger_options().debugger_type(), - quantization_options.debugger_options().log_dir_path())); + quantization_options.debugger_config().debugger_type(), + quantization_options.debugger_config().log_dir_path())); } pm.addNestedPass( mlir::quant::CreateInsertCustomAggregationOpsPass( diff --git a/tensorflow/tools/api/golden/v1/tensorflow.quantization.experimental.-quantization-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.quantization.experimental.-quantization-options.pbtxt index 7edb5900b4b5f4..e00e4c66e47900 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.quantization.experimental.-quantization-options.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.quantization.experimental.-quantization-options.pbtxt @@ -96,11 +96,11 @@ tf_proto { type_name: ".stablehlo.quantization.CalibrationOptions" } field { - name: "debugger_options" + name: "debugger_config" number: 16 label: LABEL_OPTIONAL type: TYPE_MESSAGE - type_name: ".tensorflow.quantization.DebuggerOptions" + type_name: ".stablehlo.quantization.DebuggerConfig" } nested_type { name: "RepresentativeDatasetsEntry" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.quantization.experimental.-quantization-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.quantization.experimental.-quantization-options.pbtxt index 7edb5900b4b5f4..e00e4c66e47900 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.quantization.experimental.-quantization-options.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.quantization.experimental.-quantization-options.pbtxt @@ -96,11 +96,11 @@ tf_proto { type_name: ".stablehlo.quantization.CalibrationOptions" } field { - name: "debugger_options" + name: "debugger_config" number: 16 label: LABEL_OPTIONAL type: TYPE_MESSAGE - type_name: ".tensorflow.quantization.DebuggerOptions" + type_name: ".stablehlo.quantization.DebuggerConfig" } nested_type { name: "RepresentativeDatasetsEntry" From a88a81a7db64070b0f66e77f88b1e54c9981106d Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Mon, 18 Mar 2024 23:23:17 -0700 Subject: [PATCH 070/670] [XLA:Python] Improve error checking for the return value of the to_iterable function of custom pytree nodes. 
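For illustration, a minimal sketch of the failure mode this targets, assuming JAX's `tree_util` as the registration front end (the `Box` class and helper names are hypothetical):

```
import jax

class Box:
  def __init__(self, value):
    self.value = value

# Buggy flatten hook (the `to_iterable` function on the C++ side): it returns
# only the children instead of a (children, aux_data) pair.
def _bad_flatten(box):
  return [box.value]  # Should be ([box.value], None).

def _unflatten(aux_data, children):
  return Box(children[0])

jax.tree_util.register_pytree_node(Box, _bad_flatten, _unflatten)

# Flattening now fails with a descriptive message along the lines of
#   "The to_iterable function for a custom PyTree node should return a
#    (children, aux_data) tuple, got [1.0]"
# rather than an opaque cast failure.
jax.tree_util.tree_leaves(Box(1.0))
```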
PiperOrigin-RevId: 617066587 --- third_party/xla/xla/python/BUILD | 1 + third_party/xla/xla/python/pytree.cc | 48 ++++++++++++++++-------- third_party/xla/xla/python/pytree.h | 8 +++- third_party/xla/xla/python/xla_client.py | 2 +- 4 files changed, 42 insertions(+), 17 deletions(-) diff --git a/third_party/xla/xla/python/BUILD b/third_party/xla/xla/python/BUILD index 3668ab3ba2ff5a..1c58803c966be2 100644 --- a/third_party/xla/xla/python/BUILD +++ b/third_party/xla/xla/python/BUILD @@ -863,6 +863,7 @@ cc_library( "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:span", "//third_party/nanobind", + "@local_config_python//:python_headers", # buildcleaner: keep "//xla/pjrt:exceptions", "@local_tsl//tsl/platform:logging", ], diff --git a/third_party/xla/xla/python/pytree.cc b/third_party/xla/xla/python/pytree.cc index edd43c8dd74a31..0c8dcf5fe02e49 100644 --- a/third_party/xla/xla/python/pytree.cc +++ b/third_party/xla/xla/python/pytree.cc @@ -18,7 +18,11 @@ limitations under the License. #include "xla/python/pytree.h" +#include + #include +#include +#include #include #include #include @@ -93,6 +97,28 @@ void PyTreeRegistry::Register(nb::object type, nb::callable to_iterable, } } +std::pair +PyTreeRegistry::Registration::ToIterable(nanobind::handle o) const { + nb::object out = to_iterable(o); + nb::tuple leaves_and_aux_data; + if (!nb::try_cast(out, leaves_and_aux_data) || + leaves_and_aux_data.size() != 2) { + throw std::invalid_argument(absl::StrCat( + "The to_iterable function for a custom PyTree node should return " + "a (children, aux_data) tuple, got ", + nb::cast(nb::repr(out)))); + } + nb::iterable leaves; + if (!nb::try_cast(leaves_and_aux_data[0], leaves)) { + throw std::invalid_argument(absl::StrCat( + "The to_iterable function for a custom PyTree node should return " + "a (children, aux_data) tuple where 'children' is iterable, " + "got ", + nb::cast(nb::repr(out)))); + } + return std::make_pair(std::move(leaves), nb::object(leaves_and_aux_data[1])); +} + // Computes the node kind of a given Python object. 
PyTreeKind PyTreeRegistry::KindOfObject( nb::handle obj, PyTreeRegistry::Registration const** custom) const { @@ -257,14 +283,10 @@ void PyTreeDef::FlattenImpl(nb::handle handle, T& leaves, break; } case PyTreeKind::kCustom: { - nb::tuple out = nb::cast(node.custom->to_iterable(handle)); - if (out.size() != 2) { - throw xla::XlaRuntimeError( - "PyTree custom to_iterable function should return a pair"); - } - node.node_data = out[1]; + auto [leaves, aux_data] = node.custom->ToIterable(handle); + node.node_data = std::move(aux_data); node.arity = 0; - for (nb::handle entry : nb::cast(out[0])) { + for (nb::handle entry : leaves) { ++node.arity; recurse(entry); } @@ -558,20 +580,16 @@ nb::list PyTreeDef::FlattenUpTo(nb::handle xs) const { nb::cast(nb::repr(node.custom->type)), nb::cast(nb::repr(object)))); } - nb::tuple out = nb::cast(node.custom->to_iterable(object)); - if (out.size() != 2) { - throw xla::XlaRuntimeError( - "PyTree custom to_iterable function should return a pair"); - } - if (node.node_data.not_equal(out[1])) { + auto [leaves, aux_data] = node.custom->ToIterable(object); + if (node.node_data.not_equal(aux_data)) { throw std::invalid_argument(absl::StrFormat( "Mismatch custom node data: %s != %s; value: %s.", nb::cast(nb::repr(node.node_data)), - nb::cast(nb::repr(out[1])), + nb::cast(nb::repr(aux_data)), nb::cast(nb::repr(object)))); } int arity = 0; - for (nb::handle entry : nb::cast(out[0])) { + for (nb::handle entry : leaves) { ++arity; agenda.push_back(nb::borrow(entry)); } diff --git a/third_party/xla/xla/python/pytree.h b/third_party/xla/xla/python/pytree.h index 266af78b56c552..9a453ad0f17f8f 100644 --- a/third_party/xla/xla/python/pytree.h +++ b/third_party/xla/xla/python/pytree.h @@ -19,9 +19,9 @@ limitations under the License. // See https://jax.readthedocs.io/en/latest/pytrees.html for the documentation // about pytree. +#include #include #include -#include #include #include #include @@ -30,6 +30,7 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/inlined_vector.h" #include "absl/hash/hash.h" +#include "absl/types/span.h" #include "third_party/nanobind/include/nanobind/nanobind.h" #include "xla/python/nb_class_ptr.h" #include "xla/python/pytree.pb.h" @@ -67,6 +68,11 @@ class PyTreeRegistry : public std::enable_shared_from_this { nanobind::callable to_iterable; // A function with signature: (aux_data, iterable) -> object nanobind::callable from_iterable; + + // Helper that calls to_iterable and validates that it returns a pair + // of an iterable and an aux_data object + std::pair ToIterable( + nanobind::handle o) const; }; // Registers a new custom type. Objects of `type` will be treated as container diff --git a/third_party/xla/xla/python/xla_client.py b/third_party/xla/xla/python/xla_client.py index d8b24aba09dcb5..ca419694f95d3a 100644 --- a/third_party/xla/xla/python/xla_client.py +++ b/third_party/xla/xla/python/xla_client.py @@ -48,7 +48,7 @@ # Just an internal arbitrary increasing number to help with backward-compatible # changes. In JAX, reference this via jax._src.lib.xla_extension_version. -_version = 246 +_version = 247 # Version number for MLIR:Python components. mlir_api_version = 55 From 752146579214f1ecb13fc9bcea8d221e7da8067f Mon Sep 17 00:00:00 2001 From: Yang Chen Date: Tue, 19 Mar 2024 00:09:52 -0700 Subject: [PATCH 071/670] #tf-data Support global shuffle for the skip dataset. 
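Sketch of what this enables, mirroring the new tests below (`_global_shuffle` is the experimental, internal entry point the tests use; it is not a public API):

```
from tensorflow.python.data.experimental.ops import global_shuffle_op
from tensorflow.python.data.ops import dataset_ops

dataset = dataset_ops.Dataset.range(10).skip(2)
dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
dataset = global_shuffle_op._global_shuffle(dataset, seed=42)
# Yields a permutation of [2, 10): the skipped prefix is excluded before the
# global-shuffle index mapping is applied. A negative skip count (i.e.
# skipping the entire dataset) is rejected with FailedPreconditionError.
```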
PiperOrigin-RevId: 617074621 --- tensorflow/core/kernels/data/BUILD | 6 +- .../core/kernels/data/skip_dataset_op.cc | 61 ++++++++++++- tensorflow/python/data/kernel_tests/BUILD | 1 + .../python/data/kernel_tests/skip_test.py | 88 +++++++++++++++++++ 4 files changed, 151 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 9509792dcd2450..3210941c5d2d7d 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -1228,9 +1228,11 @@ tf_kernel_library( deps = [ "//tensorflow/core:dataset_ops_op_lib", "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", + "//tensorflow/core/data:global_shuffle_utils", "//tensorflow/core/data:name_utils", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@local_tsl//tsl/platform:errors", ], ) diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc index 2a0c75f4c54b93..c5ccea131b96c2 100644 --- a/tensorflow/core/kernels/data/skip_dataset_op.cc +++ b/tensorflow/core/kernels/data/skip_dataset_op.cc @@ -14,9 +14,17 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/kernels/data/skip_dataset_op.h" +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/core/data/global_shuffle_utils.h" #include "tensorflow/core/data/name_utils.h" +#include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" +#include "tsl/platform/errors.h" namespace tensorflow { namespace data { @@ -40,6 +48,14 @@ class SkipDatasetOp::Dataset : public DatasetBase { Dataset(OpKernelContext* ctx, int64_t count, const DatasetBase* input) : DatasetBase(DatasetContext(ctx)), count_(count), input_(input) { input_->Ref(); + if (input_ != nullptr && count >= 0) { + random_indexing_compatible_ = input_->RandomIndexingCompatible(); + } else { + random_indexing_compatible_ = absl::FailedPreconditionError( + absl::StrCat("Global shuffling does not support empty dataset or " + "skipping the entire dataset. Got skip(", + count, ").")); + } } ~Dataset() override { input_->Unref(); } @@ -90,6 +106,10 @@ class SkipDatasetOp::Dataset : public DatasetBase { return input_->Get(ctx, index + count_, out_tensors); } + absl::Status RandomIndexingCompatible() const override { + return random_indexing_compatible_; + } + protected: Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, @@ -156,10 +176,13 @@ class SkipDatasetOp::Dataset : public DatasetBase { return absl::OkStatus(); } + IteratorContextWithIndexMapper ctx_with_index_mapper(ctx, this); if (i_ < dataset()->count_) { int num_skipped; - TF_RETURN_IF_ERROR(input_impl_->Skip(ctx, dataset()->count_ - i_, + TF_RETURN_IF_ERROR(input_impl_->Skip(ctx_with_index_mapper.Get(), + dataset()->count_ - i_, end_of_sequence, &num_skipped)); + ctx_with_index_mapper.MergeCheckpoint(); i_ += num_skipped; if (*end_of_sequence) { // We reached the end before the count was reached. @@ -169,14 +192,29 @@ class SkipDatasetOp::Dataset : public DatasetBase { } // Return GetNext() on the underlying iterator. 
- TF_RETURN_IF_ERROR( - input_impl_->GetNext(ctx, out_tensors, end_of_sequence)); + TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx_with_index_mapper.Get(), + out_tensors, end_of_sequence)); + ctx_with_index_mapper.MergeCheckpoint(); if (*end_of_sequence) { input_impl_.reset(); } return absl::OkStatus(); } + IndexMapperFn GetIndexMapper( + IndexMapperFn parent_index_mapper) const override { + int64_t skip_count = dataset()->count_; + return [parent_index_mapper, + skip_count](size_t element_position) -> size_t { + if (element_position < skip_count) { + // The first `skip_count` elements are to be skipped. + return parent_index_mapper(element_position); + } + // Maps the range [skip_count, cardinality) to a permuted range. + return parent_index_mapper(element_position - skip_count) + skip_count; + }; + } + protected: std::shared_ptr CreateNode( IteratorContext* ctx, model::Node::Args args) const override { @@ -198,6 +236,22 @@ class SkipDatasetOp::Dataset : public DatasetBase { Status RestoreInternal(IteratorContext* ctx, IteratorStateReader* reader) override { + if (ctx->restored_element_count().has_value()) { + mutex_lock l(mu_); + if (*ctx->restored_element_count() > 0) { + i_ = dataset()->count_; + // For upstream iterators, the restored count is the returned element + // count + skipped element count. + IteratorContext::Params params(ctx); + params.restored_element_count = + *ctx->restored_element_count() + dataset()->count_; + IteratorContext ctx_with_restored_count(params); + return RestoreInput(&ctx_with_restored_count, reader, input_impl_); + } + i_ = 0; + return RestoreInput(ctx, reader, input_impl_); + } + mutex_lock l(mu_); TF_RETURN_IF_ERROR(reader->ReadScalar(prefix(), kCurIndex, &i_)); int64_t input_empty; @@ -219,6 +273,7 @@ class SkipDatasetOp::Dataset : public DatasetBase { const int64_t count_; const DatasetBase* const input_; + absl::Status random_indexing_compatible_; }; SkipDatasetOp::SkipDatasetOp(OpKernelConstruction* ctx) diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD index 3b63533996613b..bdb05c5950c821 100644 --- a/tensorflow/python/data/kernel_tests/BUILD +++ b/tensorflow/python/data/kernel_tests/BUILD @@ -1179,6 +1179,7 @@ tf_py_strict_test( deps = [ ":checkpoint_test_base", ":test_base", + "//tensorflow/python/data/experimental/ops:global_shuffle_op", "//tensorflow/python/data/experimental/ops:random_access", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/data/ops:options", diff --git a/tensorflow/python/data/kernel_tests/skip_test.py b/tensorflow/python/data/kernel_tests/skip_test.py index d117ced2b12222..bba8d1e30ca68d 100644 --- a/tensorflow/python/data/kernel_tests/skip_test.py +++ b/tensorflow/python/data/kernel_tests/skip_test.py @@ -13,9 +13,13 @@ # limitations under the License. 
# ============================================================================== """Tests for `tf.data.Dataset.skip()`.""" + +from typing import Callable, Optional + from absl.testing import parameterized import numpy as np +from tensorflow.python.data.experimental.ops import global_shuffle_op from tensorflow.python.data.experimental.ops import random_access from tensorflow.python.data.kernel_tests import checkpoint_test_base from tensorflow.python.data.kernel_tests import test_base @@ -124,5 +128,89 @@ def testMultipleCombinations(self, elements, skip): self.evaluate(random_access.at(dataset, index=i)), i + skip) +class SkipGlobalShuffleTest(test_base.DatasetTestBase, parameterized.TestCase): + + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine( + dataset_range=[10], + count=[0, 2], + repetitions=[1, 2], + seed=[None, 42], + reshuffle_each_iteration=[True, False]))) + def testSkip( + self, + dataset_range: int, + count: int, + repetitions: int, + seed: Optional[int], + reshuffle_each_iteration: bool): + dataset = dataset_ops.Dataset.range(dataset_range) + dataset = dataset.skip(count) + dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE) + if repetitions > 1: + dataset = dataset.repeat(repetitions) + dataset = global_shuffle_op._global_shuffle( + dataset, seed=seed, reshuffle_each_iteration=reshuffle_each_iteration) + + expected = list(range(count, dataset_range)) * repetitions + dataset_output = self.getDatasetOutput( + dataset, requires_initialization=True) + self.assertCountEqual(dataset_output, expected) + self.assertNotEqual(dataset_output, expected) + self.assertLen(dataset_output, self.evaluate(dataset.cardinality())) + + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(skip=[-2, -1]))) + def testNegativeSkip(self, skip: int): + dataset = dataset_ops.Dataset.range(10).skip(skip) + with self.assertRaises(errors.FailedPreconditionError): + dataset = global_shuffle_op._global_shuffle(dataset) + self.getDatasetOutput(dataset, requires_initialization=True) + + +class SkipGlobalShuffleCheckpointTest( + checkpoint_test_base.CheckpointTestBase, parameterized.TestCase): + + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + checkpoint_test_base.default_test_combinations(), + combinations.combine( + dataset_range=[10], + count=[0, 2], + repetitions=[1, 2], + reshuffle_each_iteration=[True, False], + symbolic_checkpoint=[True, False]))) + def testSkip( + self, + verify_fn: Callable[..., None], + dataset_range: int, + count: int, + repetitions: int, + reshuffle_each_iteration: bool, + symbolic_checkpoint: bool): + def _build_dataset() -> dataset_ops.Dataset: + dataset = dataset_ops.Dataset.range(dataset_range) + dataset = dataset.skip(count) + dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE) + if repetitions > 1: + dataset = dataset.repeat(repetitions) + dataset = global_shuffle_op._global_shuffle( + dataset, seed=42, reshuffle_each_iteration=reshuffle_each_iteration) + options = options_lib.Options() + options.experimental_symbolic_checkpoint = symbolic_checkpoint + return dataset.with_options(options) + + verify_fn( + self, + _build_dataset, + num_outputs=(dataset_range - count) * repetitions, + assert_items_equal=reshuffle_each_iteration, + ) + + if __name__ == "__main__": test.main() From 65d46501f75ecb6b564c5c156d889dd0a48269e3 Mon Sep 17 00:00:00 2001 From: Son Tuan Vu Date: Tue, 19 Mar 2024 00:28:35 -0700 
Subject: [PATCH 072/670] [xla:gpu] No need to distinguish operand vs. result slices for AddressComputationThunk Distinguishing between operand and result is only required when creating the embedded thunk, during `ExecuteOnStream` all we need is the list of buffers. PiperOrigin-RevId: 617077937 --- .../gpu/runtime/address_computation_thunk.cc | 198 ++++-------------- .../gpu/runtime/address_computation_thunk.h | 32 +-- .../runtime/address_computation_thunk_test.cc | 61 +++--- 3 files changed, 86 insertions(+), 205 deletions(-) diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc index 28cf9163774ca5..3872683e70a75d 100644 --- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc +++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc @@ -44,63 +44,38 @@ namespace gpu { AddressComputationThunk::AddressComputationThunk( ThunkInfo thunk_info, std::unique_ptr embedded_thunk, - std::vector> operands, - std::vector> results, + std::vector> arguments, std::vector>> - operand_offset_buffer_indices, - std::vector> operand_orig_shapes, - std::vector> operand_sliced_shapes, - std::vector>> - result_offset_buffer_indices, - std::vector> result_orig_shapes, - std::vector> result_sliced_shapes) + offset_buffer_indices, + std::vector> orig_shapes, + std::vector> sliced_shapes) : Thunk(Kind::kAddressComputation, thunk_info), embedded_thunk_(std::make_unique( ThunkInfo(thunk_info.op), std::move(*embedded_thunk))), - embedded_thunk_operands_(std::move(operands)), - embedded_thunk_results_(std::move(results)), - operand_offset_buffer_indices_(std::move(operand_offset_buffer_indices)), - operand_orig_shapes_(std::move(operand_orig_shapes)), - operand_sliced_shapes_(std::move(operand_sliced_shapes)), - result_offset_buffer_indices_(std::move(result_offset_buffer_indices)), - result_orig_shapes_(std::move(result_orig_shapes)), - result_sliced_shapes_(std::move(result_sliced_shapes)) {} + embedded_thunk_arguments_(std::move(arguments)), + offset_buffer_indices_(std::move(offset_buffer_indices)), + orig_shapes_(std::move(orig_shapes)), + sliced_shapes_(std::move(sliced_shapes)) {} absl::Status AddressComputationThunk::Prepare( const PrepareParams& params, ResourceRequests& resource_requests) { - auto num_operands = embedded_thunk_operands_.size(); - TF_RET_CHECK(num_operands == operand_offset_buffer_indices_.size()); - TF_RET_CHECK(num_operands == operand_orig_shapes_.size()); - TF_RET_CHECK(num_operands == operand_sliced_shapes_.size()); - for (unsigned i = 0; i < num_operands; ++i) { - if (operand_sliced_shapes_[i].has_value()) { - TF_RET_CHECK(embedded_thunk_operands_[i].has_value()); - TF_RET_CHECK(operand_offset_buffer_indices_[i].has_value()); - TF_RET_CHECK(operand_sliced_shapes_[i]->IsArray()); - TF_RET_CHECK(operand_orig_shapes_[i].has_value() && - operand_orig_shapes_[i]->IsArray()); - TF_RET_CHECK(operand_sliced_shapes_[i]->rank() == - operand_orig_shapes_[i]->rank()); - TF_RET_CHECK(operand_offset_buffer_indices_[i]->size() == - operand_orig_shapes_[i]->rank()); - } - } - - auto num_results = embedded_thunk_results_.size(); - TF_RET_CHECK(num_results == result_offset_buffer_indices_.size()); - TF_RET_CHECK(num_results == result_orig_shapes_.size()); - TF_RET_CHECK(num_results == result_sliced_shapes_.size()); - for (unsigned i = 0; i < num_results; ++i) { - if (result_sliced_shapes_[i].has_value()) { - TF_RET_CHECK(embedded_thunk_results_[i].has_value()); - 
TF_RET_CHECK(result_offset_buffer_indices_[i].has_value()); - TF_RET_CHECK(result_sliced_shapes_[i]->IsArray()); - TF_RET_CHECK(result_orig_shapes_[i].has_value() && - result_orig_shapes_[i]->IsArray()); - TF_RET_CHECK(result_sliced_shapes_[i]->rank() == - result_orig_shapes_[i]->rank()); - TF_RET_CHECK(result_offset_buffer_indices_[i]->size() == - result_orig_shapes_[i]->rank()); + auto num_arguments = embedded_thunk_arguments_.size(); + TF_RET_CHECK(num_arguments == offset_buffer_indices_.size()); + TF_RET_CHECK(num_arguments == orig_shapes_.size()); + TF_RET_CHECK(num_arguments == sliced_shapes_.size()); + for (auto [argument, offset_slice, orig_shape, sliced_shape] : + llvm::zip(embedded_thunk_arguments_, offset_buffer_indices_, + orig_shapes_, sliced_shapes_)) { + if (offset_slice.has_value()) { + TF_RET_CHECK(argument.has_value()); + TF_RET_CHECK(orig_shape.has_value()); + TF_RET_CHECK(sliced_shape.has_value()); + + TF_RET_CHECK(orig_shape->IsArray()); + TF_RET_CHECK(sliced_shape->IsArray()); + + TF_RET_CHECK(offset_slice->size() == orig_shape->rank()); + TF_RET_CHECK(sliced_shape->rank() == orig_shape->rank()); } } @@ -112,38 +87,17 @@ absl::Status AddressComputationThunk::Initialize( const InitializeParams& params) { TF_RETURN_IF_ERROR(embedded_thunk_->Initialize(params)); - unsigned operand_offset_count = 0; - for (auto maybe_shape : operand_sliced_shapes_) { - operand_offset_count += - (maybe_shape == std::nullopt) ? 1 : maybe_shape->rank(); - } - - { - absl::MutexLock lock(&mutex_); - if (auto it = operand_offsets_.find(params.executor); - it == operand_offsets_.end()) { - TF_ASSIGN_OR_RETURN(std::unique_ptr allocation, - params.executor->HostMemoryAllocate( - operand_offset_count * sizeof(int64_t))); - operand_offsets_.emplace(params.executor, std::move(allocation)); - } - } - - unsigned result_offset_count = 0; - for (auto maybe_shape : result_sliced_shapes_) { - result_offset_count += - (maybe_shape == std::nullopt) ? 1 : maybe_shape->rank(); + unsigned offset_count = 0; + for (auto maybe_shape : sliced_shapes_) { + offset_count += (maybe_shape == std::nullopt) ? 1 : maybe_shape->rank(); } - { - absl::MutexLock lock(&mutex_); - if (auto it = result_offsets_.find(params.executor); - it == result_offsets_.end()) { - TF_ASSIGN_OR_RETURN(std::unique_ptr allocation, - params.executor->HostMemoryAllocate( - result_offset_count * sizeof(int64_t))); - result_offsets_.emplace(params.executor, std::move(allocation)); - } + absl::MutexLock lock(&mutex_); + if (auto it = offsets_.find(params.executor); it == offsets_.end()) { + TF_ASSIGN_OR_RETURN( + std::unique_ptr allocation, + params.executor->HostMemoryAllocate(offset_count * sizeof(int64_t))); + offsets_.emplace(params.executor, std::move(allocation)); } return absl::OkStatus(); @@ -155,28 +109,27 @@ absl::Status AddressComputationThunk::ExecuteOnStream( std::vector new_buffers; const BufferAllocations& orig_allocations = *params.buffer_allocations; - // Get memory allocation for copying operand offsets from device. - int64_t* operand_offsets_base = [&] { + // Get memory allocation for copying offsets from device. 
+ int64_t* offsets_base = [&] { absl::MutexLock lock(&mutex_); - return reinterpret_cast( - operand_offsets_.at(stream.parent())->opaque()); + return reinterpret_cast(offsets_.at(stream.parent())->opaque()); }(); - for (unsigned i = 0; i < operand_offset_buffer_indices_.size(); ++i) { - if (embedded_thunk_operands_[i] == std::nullopt) { + for (unsigned i = 0; i < offset_buffer_indices_.size(); ++i) { + if (embedded_thunk_arguments_[i] == std::nullopt) { new_buffers.push_back(se::DeviceMemoryBase()); continue; } se::DeviceMemoryBase orig_operand = - orig_allocations.GetDeviceAddress(*embedded_thunk_operands_[i]); - if (operand_offset_buffer_indices_[i] == std::nullopt) { + orig_allocations.GetDeviceAddress(*embedded_thunk_arguments_[i]); + if (offset_buffer_indices_[i] == std::nullopt) { new_buffers.push_back(orig_operand); continue; } - const Shape& src_shape = *operand_orig_shapes_[i]; - const Shape& dst_shape = *operand_sliced_shapes_[i]; + const Shape& src_shape = *orig_shapes_[i]; + const Shape& dst_shape = *sliced_shapes_[i]; TF_RET_CHECK(IsContiguousSlice(src_shape, dst_shape)); std::vector slice_starts; @@ -184,10 +137,10 @@ absl::Status AddressComputationThunk::ExecuteOnStream( // Get offset for ith operand, which has `dst_shape.rank()` components. for (auto [idx, offset_slice] : - llvm::enumerate(*operand_offset_buffer_indices_[i])) { + llvm::enumerate(*offset_buffer_indices_[i])) { se::DeviceMemoryBase offset_src = orig_allocations.GetDeviceAddress(offset_slice); - int64_t* offset_dst = &operand_offsets_base[i + idx]; + int64_t* offset_dst = &offsets_base[i + idx]; // Copy the idx-th component of the ith offset from device to host. TF_RETURN_IF_ERROR( stream.Memcpy(offset_dst, offset_src, sizeof(int64_t))); @@ -203,7 +156,7 @@ absl::Status AddressComputationThunk::ExecuteOnStream( // Compute new slice. No need to copy the content to new buffers as we can // reuse the original buffers since slices are contiguous. int64_t new_size = ShapeUtil::ByteSizeOf(dst_shape); - BufferAllocation::Slice orig_slice = *embedded_thunk_operands_[i]; + BufferAllocation::Slice orig_slice = *embedded_thunk_arguments_[i]; int64_t new_offset = orig_slice.offset(); for (auto [start, stride] : @@ -214,65 +167,6 @@ absl::Status AddressComputationThunk::ExecuteOnStream( new_buffers.push_back(orig_operand.GetByteSlice(new_offset, new_size)); } - // Get memory allocation for copying result offsets from device. - int64_t* result_offsets_base = [&] { - absl::MutexLock lock(&mutex_); - return reinterpret_cast( - result_offsets_.at(stream.parent())->opaque()); - }(); - - for (unsigned i = 0; i < result_offset_buffer_indices_.size(); ++i) { - if (embedded_thunk_results_[i] == std::nullopt) { - new_buffers.push_back(se::DeviceMemoryBase()); - continue; - } - - se::DeviceMemoryBase orig_result = - orig_allocations.GetDeviceAddress(*embedded_thunk_results_[i]); - if (result_offset_buffer_indices_[i] == std::nullopt) { - new_buffers.push_back(orig_result); - continue; - } - - const Shape& src_shape = *result_orig_shapes_[i]; - const Shape& dst_shape = *result_sliced_shapes_[i]; - TF_RET_CHECK(IsContiguousSlice(src_shape, dst_shape)); - - std::vector slice_starts; - slice_starts.reserve(dst_shape.rank()); - - // Get offset for ith result, which has `dst_shape.rank()` components. 
- for (auto [idx, offset_slice] : - llvm::enumerate(*result_offset_buffer_indices_[i])) { - se::DeviceMemoryBase offset_src = - orig_allocations.GetDeviceAddress(offset_slice); - int64_t* offset_dst = &result_offsets_base[i + idx]; - // Copy the idx-th component of the ith offset from device to host. - TF_RETURN_IF_ERROR( - stream.Memcpy(offset_dst, offset_src, sizeof(int64_t))); - - if (absl::Status blocked = stream.BlockHostUntilDone(); !blocked.ok()) { - return absl::InternalError(absl::StrFormat( - "Failed to retrieve all slice offset values on stream %p: %s", - &stream, blocked.message())); - } - slice_starts.push_back(*offset_dst); - } - - // Compute new slice. No need to copy the content to new buffers as we can - // reuse the original buffers since slices are contiguous. - int64_t new_size = ShapeUtil::ByteSizeOf(dst_shape); - BufferAllocation::Slice orig_slice = *embedded_thunk_results_[i]; - - int64_t new_offset = orig_slice.offset(); - for (auto [start, stride] : - llvm::zip(slice_starts, *ShapeUtil::ByteStrides(src_shape))) { - new_offset += start * stride; - } - - new_buffers.push_back(orig_result.GetByteSlice(new_offset, new_size)); - } - // Safe to create a local BufferAllocations here since buffers are only slices // of bigger ones allocated elsewhere. BufferAllocations new_allocations(new_buffers, diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h index b52b5fdfde861e..a08d5c19d0d47b 100644 --- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h +++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h @@ -44,16 +44,11 @@ class AddressComputationThunk : public Thunk { public: AddressComputationThunk( ThunkInfo thunk_info, std::unique_ptr embedded_thunk, - std::vector> operands, - std::vector> results, + std::vector> arguments, std::vector>> - operand_offset_buffer_indices, - std::vector> operand_orig_shapes, - std::vector> operand_sliced_shapes, - std::vector>> - result_offset_buffer_indices, - std::vector> result_orig_shapes, - std::vector> result_sliced_shapes); + offset_buffer_indices, + std::vector> orig_shapes, + std::vector> sliced_shapes); AddressComputationThunk(const AddressComputationThunk&) = delete; AddressComputationThunk& operator=(const AddressComputationThunk&) = delete; @@ -66,26 +61,17 @@ class AddressComputationThunk : public Thunk { private: std::unique_ptr embedded_thunk_; std::vector> - embedded_thunk_operands_; - std::vector> - embedded_thunk_results_; - std::vector>> - operand_offset_buffer_indices_; - std::vector> operand_orig_shapes_; - std::vector> operand_sliced_shapes_; + embedded_thunk_arguments_; std::vector>> - result_offset_buffer_indices_; - std::vector> result_orig_shapes_; - std::vector> result_sliced_shapes_; + offset_buffer_indices_; + std::vector> orig_shapes_; + std::vector> sliced_shapes_; // Pinned host memory for transferring offset values from device to host. 
absl::Mutex mutex_; absl::flat_hash_map> - operand_offsets_ ABSL_GUARDED_BY(mutex_); - absl::flat_hash_map> - result_offsets_ ABSL_GUARDED_BY(mutex_); + offsets_ ABSL_GUARDED_BY(mutex_); }; } // namespace gpu diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc index dc57a6447922e4..d2b2d48262ccc2 100644 --- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc +++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc @@ -128,12 +128,13 @@ TEST(AddressComputationThunkTest, SlicedGemm) { slice_lhs_offset_1}; AddressComputationThunk thunk( Thunk::ThunkInfo(nullptr), - std::make_unique(std::move(seq)), {slice_lhs, slice_rhs}, - {slice_out, slice_workspace}, {lhs_offsets, std::nullopt}, - {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), std::nullopt}, - {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt}, - {std::nullopt, std::nullopt}, {std::nullopt, std::nullopt}, - {std::nullopt, std::nullopt}); + std::make_unique(std::move(seq)), + {slice_lhs, slice_rhs, slice_out, slice_workspace}, + {lhs_offsets, std::nullopt, std::nullopt, std::nullopt}, + {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), std::nullopt, + std::nullopt, std::nullopt}, + {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt, + std::nullopt, std::nullopt}); // Step 2: // Execute address computation thunk. @@ -270,14 +271,15 @@ TEST(AddressComputationThunkTest, SlicedNonContiguousGemm) { slice_rhs_offset_1}; AddressComputationThunk thunk( Thunk::ThunkInfo(nullptr), - std::make_unique(std::move(seq)), {slice_lhs, slice_rhs}, - {slice_out, slice_workspace}, {lhs_offsets, rhs_offsets}, + std::make_unique(std::move(seq)), + {slice_lhs, slice_rhs, slice_out, slice_workspace}, + {lhs_offsets, rhs_offsets, std::nullopt, std::nullopt}, {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), - ShapeUtil::MakeShape(PrimitiveType::F32, {4, 3})}, + ShapeUtil::MakeShape(PrimitiveType::F32, {4, 3}), std::nullopt, + std::nullopt}, {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2}), - ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2})}, - {std::nullopt, std::nullopt}, {std::nullopt, std::nullopt}, - {std::nullopt, std::nullopt}); + ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2}), std::nullopt, + std::nullopt}); // Step 2: // Execute address computation thunk. @@ -418,14 +420,15 @@ TEST(AddressComputationThunkTest, MulipleSlicedOperandsGemm) { slice_rhs_offset_1}; AddressComputationThunk thunk( Thunk::ThunkInfo(nullptr), - std::make_unique(std::move(seq)), {slice_lhs, slice_rhs}, - {slice_out, slice_workspace}, {lhs_offsets, rhs_offsets}, + std::make_unique(std::move(seq)), + {slice_lhs, slice_rhs, slice_out, slice_workspace}, + {lhs_offsets, rhs_offsets, std::nullopt, std::nullopt}, {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), - ShapeUtil::MakeShape(PrimitiveType::F32, {8, 1})}, + ShapeUtil::MakeShape(PrimitiveType::F32, {8, 1}), std::nullopt, + std::nullopt}, {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), - ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1})}, - {std::nullopt, std::nullopt}, {std::nullopt, std::nullopt}, - {std::nullopt, std::nullopt}); + ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), std::nullopt, + std::nullopt}); // Step 2: // Execute address computation thunk. 
@@ -582,13 +585,12 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) { slice_offset_0, slice_offset_1, slice_offset_2, slice_offset_3}; AddressComputationThunk thunk( Thunk::ThunkInfo(nullptr), - std::make_unique(std::move(seq)), {slice_src}, {slice_dst}, - {slice_offsets}, - {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 8})}, + std::make_unique(std::move(seq)), {slice_src, slice_dst}, + {slice_offsets, std::nullopt}, + {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 8}), std::nullopt}, // Make sure to pass a dst shape with the same rank as src shape (i.e. // original slice result and not bitcasted one) - {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 8, 8})}, {std::nullopt}, - {std::nullopt}, {std::nullopt}); + {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 8, 8}), std::nullopt}); // Step 2: // Execute address computation thunk. @@ -739,15 +741,14 @@ TEST(AddressComputationThunkTest, SlicedOutputMemcpy) { slice_dst_offset_3}; AddressComputationThunk thunk( Thunk::ThunkInfo(nullptr), - std::make_unique(std::move(seq)), {slice_src}, {slice_dst}, - {slice_src_offsets}, - {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 2})}, + std::make_unique(std::move(seq)), {slice_src, slice_dst}, + {slice_src_offsets, slice_dst_offsets}, + {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 2}), + ShapeUtil::MakeShape(PrimitiveType::S32, {2, 2, 2, 2})}, // Make sure to pass a dst shape with the same rank as src shape (i.e. // original slice result and not bitcasted one) - {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2})}, - {slice_dst_offsets}, - {{ShapeUtil::MakeShape(PrimitiveType::S32, {2, 2, 2, 2})}}, - {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2})}); + {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2}), + ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2})}); // Step 2: // Execute address computation thunk. From a5ec72acbf058dfe016f97f49b27e9b4668d48da Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 19 Mar 2024 02:02:27 -0700 Subject: [PATCH 073/670] compat: Update forward compatibility horizon to 2024-03-19 PiperOrigin-RevId: 617095822 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 813819ae0aec8d..ef8e89811f42bb 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -29,7 +29,7 @@ # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2024, 3, 18) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2024, 3, 19) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 5b4e1879bd6799140f91a856082f0f49a3d989fe Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 19 Mar 2024 02:02:35 -0700 Subject: [PATCH 074/670] Update GraphDef version to 1806. PiperOrigin-RevId: 617095844 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index b199c37ee80142..0d3b39573bb785 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. 
#define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 1805 // Updated: 2024/3/18 +#define TF_GRAPH_DEF_VERSION 1806 // Updated: 2024/3/19 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From c9c2f388759e39ea4dd09e90ece46cc6278c336f Mon Sep 17 00:00:00 2001 From: Quentin Khan Date: Tue, 19 Mar 2024 02:43:09 -0700 Subject: [PATCH 075/670] #shlo_ref Add std compatible member functions to `Shape`. PiperOrigin-RevId: 617104178 --- tensorflow/lite/experimental/shlo/shape.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tensorflow/lite/experimental/shlo/shape.h b/tensorflow/lite/experimental/shlo/shape.h index 262c7b0bc1e901..72a322299972d1 100644 --- a/tensorflow/lite/experimental/shlo/shape.h +++ b/tensorflow/lite/experimental/shlo/shape.h @@ -66,6 +66,24 @@ class Shape { // and possible confusion with C++ container's usage of size(). DimensionSize NumElements() const; + // The following members are provided for compatibility with the standard + // library. + using value_type = DimensionSize; + + const value_type& operator[](int dim) const { return dims_[dim]; } + value_type& operator[](int dim) { return dims_[dim]; } + + auto cbegin() const { return dims_.begin(); } + auto begin() const { return dims_.begin(); } + auto begin() { return dims_.begin(); } + auto cend() const { return dims_.end(); } + auto end() const { return dims_.end(); } + auto end() { return dims_.end(); } + bool empty() const { return dims_.empty(); } + size_t size() const { return dims_.size(); } + const value_type* data() const { return dims_.data(); } + value_type* data() { return dims_.data(); } + private: absl::InlinedVector dims_; }; From 89ffb3c7f16df4c2e72896465c0200ff6ac9e735 Mon Sep 17 00:00:00 2001 From: Johannes Reifferscheid Date: Tue, 19 Mar 2024 03:01:54 -0700 Subject: [PATCH 076/670] Support all transposes. 
PiperOrigin-RevId: 617107984 --- .../xla/xla/service/gpu/fusions/fusions.cc | 2 +- .../fusions/mlir/computation_partitioner.cc | 18 +++--- .../gpu/fusions/mlir/elemental_hlo_to_mlir.cc | 31 ++++++++-- .../mlir/elemental_hlo_to_mlir_test.cc | 21 +++++++ .../xla/service/gpu/fusions/transpose_mlir.cc | 52 ++++++++-------- .../xla/service/gpu/fusions/transpose_mlir.h | 9 +-- .../gpu/fusions/transpose_mlir_test.cc | 59 ++++++++++++++++++- 7 files changed, 146 insertions(+), 46 deletions(-) diff --git a/third_party/xla/xla/service/gpu/fusions/fusions.cc b/third_party/xla/xla/service/gpu/fusions/fusions.cc index 6dd5bf20d80152..5037dd676e0171 100644 --- a/third_party/xla/xla/service/gpu/fusions/fusions.cc +++ b/third_party/xla/xla/service/gpu/fusions/fusions.cc @@ -212,7 +212,7 @@ absl::StatusOr> GetFusionEmitter( return std::make_unique(analysis); } case HloFusionAnalysis::EmitterFusionKind::kTranspose: { - if (check_mlir_emitters(MlirTransposeFusion::IsSupported)) { + if (check_mlir_emitters(nullptr)) { return std::make_unique(analysis); } return std::make_unique(analysis); diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/computation_partitioner.cc b/third_party/xla/xla/service/gpu/fusions/mlir/computation_partitioner.cc index 472efdb0197501..ad46b914a9e484 100644 --- a/third_party/xla/xla/service/gpu/fusions/mlir/computation_partitioner.cc +++ b/third_party/xla/xla/service/gpu/fusions/mlir/computation_partitioner.cc @@ -341,15 +341,19 @@ mlir::func::FuncOp CreateSubgraphMlirFunction( return *ConvertPrimitiveTypeToMlirType(shape.element_type(), b); }; - const xla::Shape* one_root_shape = nullptr; + const xla::Shape* first_root_shape = nullptr; for (auto* root : subgraph.roots) { if (root->shape().IsTuple()) { for (auto& shape : root->shape().tuple_shapes()) { - one_root_shape = &shape; + if (!first_root_shape) { + first_root_shape = &shape; + } result_types.push_back(element_type(shape)); } } else { - one_root_shape = &root->shape(); + if (!first_root_shape) { + first_root_shape = &root->shape(); + } result_types.push_back(element_type(root->shape())); } } @@ -362,13 +366,13 @@ mlir::func::FuncOp CreateSubgraphMlirFunction( parameter_types.push_back(TensorShapeToMlirType(param->shape(), b)); arg_attrs.emplace_back(); } - for (int dim = 0; dim < one_root_shape->rank(); ++dim) { + for (int dim = 0; dim < first_root_shape->rank(); ++dim) { parameter_types.push_back(b.getIndexType()); arg_attrs.emplace_back(mlir::DictionaryAttr::get( b.getContext(), - {b.getNamedAttr( - "xla.range", - b.getIndexArrayAttr({0, one_root_shape->dimensions(dim) - 1}))})); + {b.getNamedAttr("xla.range", + b.getIndexArrayAttr( + {0, first_root_shape->dimensions(dim) - 1}))})); } // Populate arguments for injected parameters (values that are computed diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.cc b/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.cc index 8e2124dbe0e87b..b9169f82207ae2 100644 --- a/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.cc +++ b/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.cc @@ -187,12 +187,14 @@ bool IsUnsupportedTuple(const HloInstruction* instr) { return true; } - // All tuple elements must have the same dimensions (element types may - // differ). + // All tuple elements must have bitcast-compatible dimensions (element types + // may differ). 
auto first_shape = instr->shape().tuple_shapes(0); for (int i = 1; i < instr->operand_count(); ++i) { - if (instr->shape().tuple_shapes(i).dimensions() != - first_shape.dimensions()) { + const auto& tuple_shape = instr->shape().tuple_shapes(i); + if (!ShapeUtil::EqualIgnoringElementType(tuple_shape, first_shape) && + !ShapeUtil::IsReshapeOrTransposeBitcast(tuple_shape, first_shape, + /*ignore_element_type=*/true)) { return true; } } @@ -544,6 +546,8 @@ Value ApplyAffineExpr(mlir::AffineExpr expr, ValueRange dims, SmallVector ApplyAffineMap(mlir::AffineMap map, ValueRange dims, ValueRange symbols, ImplicitLocOpBuilder& b) { + CHECK_EQ(map.getNumDims(), dims.size()); + CHECK_EQ(map.getNumSymbols(), symbols.size()); SmallVector result; result.reserve(map.getNumResults()); for (auto expr : map.getResults()) { @@ -606,6 +610,7 @@ absl::StatusOr> HloToMlir( result_element_type = sign_converter.convertType(element_mlir_type); } + IndexingContext indexing_context(builder.getContext()); // Handle ops that aren't elementwise and aren't just indexing // transformations. switch (instr->opcode()) { @@ -648,11 +653,26 @@ absl::StatusOr> HloToMlir( builder); case HloOpcode::kTuple: { CHECK(!IsUnsupportedTuple(instr)); + const auto& first_shape = instr->shape().tuple_shapes(0); + CHECK_EQ(first_shape.rank(), indices.size()) + << "Indices for tuple must be for the first tuple element"; SmallVector operands; for (int i = 0; i < instr->operand_count(); ++i) { + llvm::SmallVector operand_indices; + // The tuple shapes only need to be bitcast compatible, so insert + // bitcasts where necessary. + if (i > 0 && !ShapeUtil::EqualIgnoringElementType( + first_shape, instr->operand(i)->shape())) { + auto operand_map = GetBitcastMap( + first_shape, instr->operand(i)->shape(), &indexing_context); + operand_indices = + ApplyAffineMap(operand_map.GetAffineMap(), indices, {}, builder); + } else { + operand_indices = indices; + } TF_ASSIGN_OR_RETURN( operands.emplace_back(), - GetSingleOperandValue(operand_provider, instr, i, indices)); + GetSingleOperandValue(operand_provider, instr, i, operand_indices)); } return operands; } @@ -675,7 +695,6 @@ absl::StatusOr> HloToMlir( operand->shape().element_type(), builder)); arg_types.push_back(operand_element_type); } - IndexingContext indexing_context(builder.getContext()); auto input_indices = GetInputIndices(ComputeOutputToInputIndexing(instr, 0, &indexing_context), indices, builder); diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir_test.cc b/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir_test.cc index cd07c607d11f93..f326b06113e0f5 100644 --- a/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir_test.cc +++ b/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir_test.cc @@ -728,6 +728,27 @@ TEST_F(ElementalHloToMlirTest, IotaComplex) { )")); } +TEST_F(ElementalHloToMlirTest, MixedIndexingTuple) { + TF_EXPECT_OK(Run(R"( + ENTRY main { + %p0 = f32[10,10] parameter(0) + %p1 = f32[100] parameter(1) + ROOT tuple = (f32[10,10], f32[100]) tuple(%p0, %p1) + })", + R"( + // CHECK: @main_tuple( + // CHECK-SAME: %[[P0:.*]]: tensor<10x10xf32>, + // CHECK-SAME: %[[P1:.*]]: tensor<100xf32>, + // CHECK-SAME: %[[X:.*]]: index {{{.*}}}, %[[Y:.*]]: index {{{.*}}} + // CHECK: %[[A:.*]] = tensor.extract %[[P0]][%[[X]], %[[Y]]] + // CHECK: %[[IDX:.*]] = affine.apply + // CHECK-SAME: affine_map<()[s0, s1] -> (s0 * 10 + s1)>() + // CHECK-SAME: [%[[X]], %[[Y]]] + // CHECK: %[[B:.*]] = tensor.extract %[[P1]][%[[IDX]]] + // 
CHECK: return %[[A]], %[[B]] + )")); +} + } // namespace } // namespace mlir_converter } // namespace gpu diff --git a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc index 4b8a2af5661935..ba41af491180a6 100644 --- a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc +++ b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc @@ -138,26 +138,9 @@ MlirTransposeFusion::MlirTransposeFusion(const HloFusionAnalysis& analysis) } } -/*static*/ bool MlirTransposeFusion::IsSupported( - const HloFusionAnalysis& analysis) { - // If there is a hero, which does not have a transpose, the codegen might - // fail because of the incorrect thread ID mapping for that particular case. - return GetShMemTransposes(analysis).size() == analysis.fusion_heroes().size(); -} - std::optional MlirTransposeFusion::ComputeThreadIdToOutputIndexing( int64_t root_index, IndexingContext* indexing_context) const { const auto& hero = *analysis_.fusion_heroes()[root_index]; - const auto& root = *analysis_.fusion_roots()[root_index]; - if (!GetDescriptionForTiledTransposeEmitter(root, hero)) { - // Non-transpose roots are elementwise by definition. - return ComputeThreadIdToInputIndexing(root_index, 0, indexing_context); - } - return ComputeThreadIdToOutputIndexing(hero, indexing_context); -} - -IndexingMap MlirTransposeFusion::ComputeThreadIdToOutputIndexing( - const HloInstruction& hero, IndexingContext* indexing_context) const { // The block offsets are permuted, but the thread offsets remain the same. auto* mlir_context = indexing_context->GetMLIRContext(); auto block_offset = GetBlockOffsetsForTiling(tiling_, mlir_context) @@ -187,6 +170,20 @@ IndexingMap MlirTransposeFusion::ComputeThreadIdToInputIndexing( return map; } +std::optional MlirTransposeFusion::ComputeThreadIdToInputIndexing( + int64_t root_index, int64_t hero_operand_index, + IndexingContext* indexing_context) const { + const auto& hero = *analysis_.fusion_heroes()[root_index]; + const auto& root = *analysis_.fusion_roots()[root_index]; + if (!GetDescriptionForTiledTransposeEmitter(root, hero)) { + // Non-transpose roots are elementwise by definition. 
+ return ComputeThreadIdToOutputIndexing(root_index, indexing_context); + } + + return ComputeThreadIdToInputIndexing(*analysis_.fusion_heroes()[root_index], + indexing_context); +} + LaunchDimensions MlirTransposeFusion::launch_dimensions() const { return LaunchDimensions(tiling_.GetNumBlocks(), tiling_.GetNumThreadsPerBlock()); @@ -298,12 +295,11 @@ absl::Status MlirTransposeFusion::EmitReadFromShMemMlir( IndexingContext indexing_context{mlir_context}; ValueRange output_tensor_args = entry_function.getArguments().drop_front(num_inputs); - auto output_indexing = ComputeThreadIdToOutputIndexing( - *shmem_transposes_.front(), &indexing_context); + auto output_indexing = *ComputeThreadIdToOutputIndexing(0, &indexing_context); auto shmem_output_indexing = GetSharedMemoryReadIndexingMap(output_indexing, permutation_[2]); auto epilogue_indexing = ComputeEpilogueInputToOutputIndexing( - shmem_transposes_.front(), &indexing_context); + analysis_.fusion_heroes()[0], &indexing_context); auto root_indexing = ComposeIndexingMaps(output_indexing, epilogue_indexing); auto result_tensors = EmitThreadLoopNest( builder, output_tensor_args, output_indexing, @@ -324,9 +320,19 @@ absl::Status MlirTransposeFusion::EmitReadFromShMemMlir( root_indices, builder); SmallVector results; results.reserve(output_tensor_args.size()); - for (auto [tensor, value] : llvm::zip(output_tensors, result_scalars)) { - results.push_back( - builder.create(value, tensor, root_indices)); + const auto& first_shape = analysis_.fusion_roots().front()->shape(); + for (auto [tensor, value, root] : llvm::zip( + output_tensors, result_scalars, analysis_.fusion_roots())) { + llvm::SmallVector indices; + if (ShapeUtil::EqualIgnoringElementType(first_shape, root->shape())) { + indices = root_indices; + } else { + auto bitcast_map = + GetBitcastMap(first_shape, root->shape(), &indexing_context); + indices = ApplyAffineMap(bitcast_map.GetAffineMap(), root_indices, + {}, builder); + } + results.push_back(builder.create(value, tensor, indices)); } return results; }); diff --git a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h index fd9f5863e8260e..3eb6e6fef98a74 100644 --- a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h +++ b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h @@ -51,23 +51,16 @@ class MlirTransposeFusion : public MlirFusionEmitterBase { explicit MlirTransposeFusion(const HloFusionAnalysis& analysis); LaunchDimensions launch_dimensions() const override; - static bool IsSupported(const HloFusionAnalysis& analysis); - std::optional ComputeThreadIdToOutputIndexing( int64_t root_index, IndexingContext* indexing_context) const override; std::optional ComputeThreadIdToInputIndexing( int64_t root_index, int64_t hero_operand_index, - IndexingContext* indexing_context) const override { - return ComputeThreadIdToInputIndexing( - *analysis_.fusion_heroes()[root_index], indexing_context); - } + IndexingContext* indexing_context) const override; protected: IndexingMap ComputeThreadIdToInputIndexing( const HloInstruction& hero, IndexingContext* indexing_context) const; - IndexingMap ComputeThreadIdToOutputIndexing( - const HloInstruction& hero, IndexingContext* indexing_context) const; absl::Status EmitEntryFunction( const mlir_converter::PartitionedComputations& computations, diff --git a/third_party/xla/xla/service/gpu/fusions/transpose_mlir_test.cc b/third_party/xla/xla/service/gpu/fusions/transpose_mlir_test.cc index 38fe0789b8eadf..bbffea39df1042 100644 
--- a/third_party/xla/xla/service/gpu/fusions/transpose_mlir_test.cc +++ b/third_party/xla/xla/service/gpu/fusions/transpose_mlir_test.cc @@ -251,7 +251,6 @@ TEST_F(MlirTransposeFusionTest, Transpose021_NoEpilogue) { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index - // CHECK: %[[SHMEM:.*]] = xla_gpu.allocate_shared : tensor<1x32x32xf32> // CHECK: %[[SHMEM_WITH_VALS:.*]] = scf.for // CHECK-SAME: %[[C0]] to %[[C8]] step %[[C1]] @@ -285,6 +284,7 @@ TEST_F(MlirTransposeFusionTest, Transpose_4D) { calls=%fused_computation } )"; + TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared")); EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3})); } @@ -303,6 +303,7 @@ TEST_F(MlirTransposeFusionTest, Transpose_2D) { calls=%fused_computation } )"; + TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared")); EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3})); } @@ -328,6 +329,7 @@ TEST_F(MlirTransposeFusionTest, Transpose_2D_2) { ROOT %fusion = f32[2820,17]{1,0} fusion(%p0, %p1), kind=kInput, calls=%fused_computation } )"; + TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared")); EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3})); } @@ -352,6 +354,7 @@ TEST_F(MlirTransposeFusionTest, MultipleRootsForTranspose) { fusion(), kind=kInput, calls=%fused_computation } )"; + TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared")); EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3})); } @@ -369,6 +372,60 @@ TEST_F(MlirTransposeFusionTest, PartialTile) { ROOT %fusion = f64[6,4,2,24] fusion(%p0), kind=kInput, calls=%fused_computation } )"; + TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared")); + EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3})); +} + +TEST_F(MlirTransposeFusionTest, MixedIndexing) { + auto kHloString = R"( + HloModule m + + fused_computation { + %p0 = f64[24,2,6,4] parameter(0) + %bc = f64[24,2,24] bitcast(%p0) + %t1 = f64[6,4,2,24] transpose(%p0), dimensions={2,3,1,0} + %t2 = f64[24,2,24] transpose(%bc), dimensions={2,1,0} + %p1 = f64[] parameter(1) + %bc1 = f64[6,4,2,24] broadcast(%p1), dimensions={} + %bc2 = f64[24,2,24] broadcast(%p1), dimensions={} + %a1 = f64[6,4,2,24] add(%t1, %bc1) + %a2 = f64[24,2,24] add(%t2, %bc2) + ROOT %t = (f64[6,4,2,24], f64[24,2,24]) tuple(%a1, %a2) + } + + ENTRY main { + %p0 = f64[24,2,6,4] parameter(0) + %p1 = f64[] parameter(1) + ROOT %fusion = (f64[6,4,2,24], f64[24,2,24]) fusion(%p0, %p1), + kind=kInput, calls=%fused_computation + } + )"; + TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared")); + EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3})); +} + +TEST_F(MlirTransposeFusionTest, SideOutputs) { + auto kHloString = R"( + HloModule m + + fused_computation { + %p0 = f64[24,2,36] parameter(0) + %p1 = f64[36,2,24] parameter(1) + %tr = f64[36,2,24] transpose(%p0), dimensions={2,1,0} + %neg = f64[36,2,24] negate(%p1) + %log = f64[24,2,36] log(%p0) + ROOT %t = (f64[36,2,24], f64[36,2,24], f64[24,2,36]) + tuple(%neg, %tr, %log) + } + + ENTRY main { + %p0 = f64[24,2,36] parameter(0) + %p1 = f64[36,2,24] parameter(1) + ROOT %fusion = (f64[36,2,24], f64[36,2,24], f64[24,2,36]) + fusion(%p0, %p1), kind=kInput, calls=%fused_computation + } + )"; + TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared")); 
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3})); } From 89572357dada17656848bb77824d834ecb225e25 Mon Sep 17 00:00:00 2001 From: Harsha H S Date: Tue, 19 Mar 2024 03:02:49 -0700 Subject: [PATCH 077/670] PR #10261: [ROCm] ConvBfloat16Support HLO pass for AMDGPU Compiler Imported from GitHub PR https://github.com/openxla/xla/pull/10261 Copybara import of the project: -- 0568134a7d3108f1c29794f536f3acbfe238dff1 by Pavel Emeliyanenko : added ConvBfloat16Support HLO pass -- 3106f99a8d75e1d891053f3a2b3ee1a46c29f5db by Harsha HS : fix typo Merging this change closes #10261 PiperOrigin-RevId: 617108219 --- .../xla/xla/service/gpu/amdgpu_compiler.cc | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/third_party/xla/xla/service/gpu/amdgpu_compiler.cc b/third_party/xla/xla/service/gpu/amdgpu_compiler.cc index 723585608b5c55..f429e20f27d1ac 100644 --- a/third_party/xla/xla/service/gpu/amdgpu_compiler.cc +++ b/third_party/xla/xla/service/gpu/amdgpu_compiler.cc @@ -28,6 +28,7 @@ limitations under the License. #include "xla/service/call_inliner.h" #include "xla/service/convert_mover.h" #include "xla/service/dot_dimension_merger.h" +#include "xla/service/float_normalization.h" #include "xla/service/gpu/autotuner_util.h" #include "xla/service/gpu/conv_algorithm_picker.h" #include "xla/service/gpu/cublas_pad_for_gemms.h" @@ -61,6 +62,34 @@ limitations under the License. namespace xla { namespace gpu { +namespace { + +struct ConvBfloat16Support : public FloatSupport { + explicit ConvBfloat16Support(const se::RocmComputeCapability& rocm) + : FloatSupport(BF16), + // TODO: MIOpen does not support bf16 convolutions yet + is_conv_bf16_supported_(rocm.has_bf16_dtype_support()) {} + + bool SupportsLowPrecisionOperand(const HloInstruction& hlo, + int64_t operand_index) const override { + return (hlo.opcode() != HloOpcode::kConvolution) || is_conv_bf16_supported_; + } + + bool SupportsLowPrecisionOutput(const HloInstruction& hlo) const override { + return (hlo.opcode() != HloOpcode::kConvolution) || is_conv_bf16_supported_; + } + + bool SupportsMixedPrecisions(const HloInstruction& hlo) const override { + // Skip all HLOs other than convolutions. + return (hlo.opcode() != HloOpcode::kConvolution); + } + + private: + bool is_conv_bf16_supported_; +}; + +} // namespace + absl::Status AMDGPUCompiler::OptimizeHloConvolutionCanonicalization( HloModule* hlo_module, se::GpuComputeCapability gpu_version, se::dnn::VersionInfo dnn_version, @@ -71,6 +100,12 @@ absl::Status AMDGPUCompiler::OptimizeHloConvolutionCanonicalization( pipeline.AddInvariantCheckerDebug( /*layout_sensitive=*/false, /*allow_mixed_precision=*/false); + + // Convert unsupported bf16 convolutions to f32. + ConvBfloat16Support conv_bf16_support( + std::get(gpu_version)); + pipeline.AddPass(&conv_bf16_support); + pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); From eebacf22187ea45c85e8581a0525f0a238ccfeb8 Mon Sep 17 00:00:00 2001 From: Quentin Khan Date: Tue, 19 Mar 2024 03:12:30 -0700 Subject: [PATCH 078/670] #shlo_ref Add typedefs for the tensor (element) type variant. 
PiperOrigin-RevId: 617110499 --- tensorflow/lite/experimental/shlo/tensor.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/experimental/shlo/tensor.h b/tensorflow/lite/experimental/shlo/tensor.h index 6904ad92db4689..57029105d1a218 100644 --- a/tensorflow/lite/experimental/shlo/tensor.h +++ b/tensorflow/lite/experimental/shlo/tensor.h @@ -33,8 +33,10 @@ constexpr TensorElementType BaselineType(TensorElementType type) { return type; } -std::variant BaselineType( - const std::variant& type); +using TensorElementTypeVariant = + std::variant; + +TensorElementTypeVariant BaselineType(const TensorElementTypeVariant& type); struct TensorType { Shape shape; @@ -46,6 +48,8 @@ struct QuantizedTensorType { QuantizedTensorElementType element_type; }; +using TensorTypeVariant = std::variant; + struct Tensor { const Shape& shape() const; Shape& shape(); @@ -69,8 +73,7 @@ struct Tensor { const TensorElementType& tensor_element_type() const; const QuantizedTensorElementType& quantized_tensor_element_type() const; - std::variant element_type() - const; + TensorElementTypeVariant element_type() const; template ::Type> T* GetDataAs() { @@ -88,7 +91,7 @@ struct Tensor { static_cast(NumElements())); } - std::variant type; + TensorTypeVariant type; // If type is TensorType, the type should be Storage::Type. // If type is QuantizedTensorType, the type should be From 1bde09fabaa39ecc09beb29aaf706e8adb346ac2 Mon Sep 17 00:00:00 2001 From: Quentin Khan Date: Tue, 19 Mar 2024 03:23:19 -0700 Subject: [PATCH 079/670] #shlo_ref Refactor unary element wise op tests. PiperOrigin-RevId: 617112557 --- tensorflow/lite/experimental/shlo/data_type.h | 9 +- tensorflow/lite/experimental/shlo/ops/BUILD | 29 +- tensorflow/lite/experimental/shlo/ops/abs.cc | 26 +- .../lite/experimental/shlo/ops/abs_test.cc | 85 ++---- tensorflow/lite/experimental/shlo/ops/cbrt.cc | 23 +- .../lite/experimental/shlo/ops/cbrt_test.cc | 66 ++--- tensorflow/lite/experimental/shlo/ops/ceil.cc | 22 +- .../lite/experimental/shlo/ops/ceil_test.cc | 66 ++--- .../lite/experimental/shlo/ops/cosine.cc | 23 +- .../lite/experimental/shlo/ops/cosine_test.cc | 67 ++--- .../lite/experimental/shlo/ops/test_util.h | 174 ++++++++++-- .../shlo/ops/unary_elementwise_test.cc | 2 +- .../shlo/ops/unary_elementwise_test_util.h | 250 ++++++++++++++++++ tensorflow/lite/experimental/shlo/ops/util.cc | 37 +++ tensorflow/lite/experimental/shlo/ops/util.h | 52 ++++ .../lite/experimental/shlo/status_matcher.h | 5 +- 16 files changed, 638 insertions(+), 298 deletions(-) create mode 100644 tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h diff --git a/tensorflow/lite/experimental/shlo/data_type.h b/tensorflow/lite/experimental/shlo/data_type.h index 8e8fe2d6202911..f313fdc175ce58 100644 --- a/tensorflow/lite/experimental/shlo/data_type.h +++ b/tensorflow/lite/experimental/shlo/data_type.h @@ -95,10 +95,17 @@ using StorageType = typename Storage::Type; constexpr bool IsBool(DataType data_type) { return data_type == DataType::kI1; } -constexpr bool IsInteger(DataType data_type) { +constexpr bool IsSignedInteger(DataType data_type) { return data_type == DataType::kSI4 || data_type == DataType::kSI8 || data_type == DataType::kSI16 || data_type == DataType::kSI32; } + +constexpr bool IsUnsignedInteger(DataType data_type) { return false; } + +constexpr bool IsInteger(DataType data_type) { + return IsSignedInteger(data_type) || IsUnsignedInteger(data_type); +} + constexpr bool IsFloat(DataType data_type) { return data_type 
== DataType::kBF16 || data_type == DataType::kF16 || data_type == DataType::kF32; diff --git a/tensorflow/lite/experimental/shlo/ops/BUILD b/tensorflow/lite/experimental/shlo/ops/BUILD index 45eaa7771807e2..15bcab95773b55 100644 --- a/tensorflow/lite/experimental/shlo/ops/BUILD +++ b/tensorflow/lite/experimental/shlo/ops/BUILD @@ -70,7 +70,9 @@ cc_library( srcs = ["util.cc"], hdrs = ["util.h"], deps = [ + "//tensorflow/lite/experimental/shlo:data_type", "//tensorflow/lite/experimental/shlo:shape", + "//tensorflow/lite/experimental/shlo:tensor", "@com_google_absl//absl/status", ], ) @@ -118,7 +120,6 @@ cc_test( "//tensorflow/lite/experimental/shlo:status_matcher", "//tensorflow/lite/experimental/shlo:tensor", "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:inlined_vector", "@com_google_googletest//:gtest_main", ], ) @@ -129,13 +130,30 @@ cc_library( hdrs = ["test_util.h"], deps = [ "//tensorflow/lite/experimental/shlo:data_type", + "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type", "//tensorflow/lite/experimental/shlo:shape", + "//tensorflow/lite/experimental/shlo:tensor", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:inlined_vector", "@com_google_googletest//:gtest", ], ) +cc_library( + name = "unary_elementwise_test_util", + testonly = True, + hdrs = ["unary_elementwise_test_util.h"], + deps = [ + ":test_util", + "//tensorflow/lite/experimental/shlo:data_type", + "//tensorflow/lite/experimental/shlo:shape", + "//tensorflow/lite/experimental/shlo:status_matcher", + "//tensorflow/lite/experimental/shlo:tensor", + "@com_google_absl//absl/status", + "@com_google_googletest//:gtest", + ], +) + cc_library( name = "abs", srcs = ["abs.cc"], @@ -156,6 +174,7 @@ cc_test( deps = [ ":abs", ":test_util", + ":unary_elementwise_test_util", "//tensorflow/lite/experimental/shlo:quantize", "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type", "//tensorflow/lite/experimental/shlo:shape", @@ -173,7 +192,6 @@ cc_library( ":unary_elementwise", ":util", "//tensorflow/lite/experimental/shlo:bf16", - "//tensorflow/lite/experimental/shlo:data_type", "//tensorflow/lite/experimental/shlo:dispatch", "//tensorflow/lite/experimental/shlo:f16", "//tensorflow/lite/experimental/shlo:tensor", @@ -188,6 +206,7 @@ cc_test( deps = [ ":cbrt", ":test_util", + ":unary_elementwise_test_util", "//tensorflow/lite/experimental/shlo:bf16", "//tensorflow/lite/experimental/shlo:f16", "//tensorflow/lite/experimental/shlo:quantize", @@ -195,7 +214,6 @@ cc_test( "//tensorflow/lite/experimental/shlo:shape", "//tensorflow/lite/experimental/shlo:status_matcher", "//tensorflow/lite/experimental/shlo:tensor", - "@com_google_absl//absl/status", "@com_google_googletest//:gtest_main", ], ) @@ -223,6 +241,7 @@ cc_test( deps = [ ":ceil", ":test_util", + ":unary_elementwise_test_util", "//tensorflow/lite/experimental/shlo:bf16", "//tensorflow/lite/experimental/shlo:f16", "//tensorflow/lite/experimental/shlo:quantize", @@ -230,7 +249,6 @@ cc_test( "//tensorflow/lite/experimental/shlo:shape", "//tensorflow/lite/experimental/shlo:status_matcher", "//tensorflow/lite/experimental/shlo:tensor", - "@com_google_absl//absl/status", "@com_google_googletest//:gtest_main", ], ) @@ -244,7 +262,6 @@ cc_library( ":unary_elementwise", ":util", "//tensorflow/lite/experimental/shlo:bf16", - "//tensorflow/lite/experimental/shlo:data_type", "//tensorflow/lite/experimental/shlo:dispatch", "//tensorflow/lite/experimental/shlo:f16", 
"//tensorflow/lite/experimental/shlo:tensor", @@ -258,6 +275,7 @@ cc_test( deps = [ ":cosine", ":test_util", + ":unary_elementwise_test_util", "//tensorflow/lite/experimental/shlo:bf16", "//tensorflow/lite/experimental/shlo:f16", "//tensorflow/lite/experimental/shlo:quantize", @@ -265,7 +283,6 @@ cc_test( "//tensorflow/lite/experimental/shlo:shape", "//tensorflow/lite/experimental/shlo:status_matcher", "//tensorflow/lite/experimental/shlo:tensor", - "@com_google_absl//absl/status", "@com_google_googletest//:gtest_main", ], ) diff --git a/tensorflow/lite/experimental/shlo/ops/abs.cc b/tensorflow/lite/experimental/shlo/ops/abs.cc index 8f8f7415b3197b..dd92713df972ed 100644 --- a/tensorflow/lite/experimental/shlo/ops/abs.cc +++ b/tensorflow/lite/experimental/shlo/ops/abs.cc @@ -33,32 +33,26 @@ AbsOp Create(typename AbsOp::Attributes) { return AbsOp{}; } absl::Status Prepare(AbsOp& op, const Tensor& input, Tensor& output) { SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape())); - if (BaselineType(input.element_type()) != - BaselineType(output.element_type())) { - return absl::FailedPreconditionError( - "stablehlo.abs constraint (C2) is not satisfied (incompatible baseline " - "types.)."); - } + SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(CheckCtx("abs"), input, + IsSignedIntTensor, IsFloatTensor, + IsQuantizedPerTensorTensor)); + SHLO_REF_RETURN_ON_ERROR( + CheckSameBaselineType(CheckCtx("abs"), input, output)); return absl::OkStatus(); } absl::Status Evaluate(AbsOp& op, const Tensor& input, Tensor& output) { Abs abs; - if (input.IsPerAxisQuantized()) { - DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerChannel, - input.quantized_tensor_element_type().StorageType(), - input.quantized_tensor_element_type().ExpressedType(), - abs, input, output); - } else if (input.IsPerTensorQuantized()) { + if (input.IsPerTensorQuantized()) { DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor, input.quantized_tensor_element_type().StorageType(), input.quantized_tensor_element_type().ExpressedType(), abs, input, output) - } else { - DISPATCH_BOOL_INT_FLOAT(detail::EvaluateNoQuantization, - input.tensor_element_type(), abs, input, output); + } else if (IsSignedIntTensor(input) || IsFloatTensor(input)) { + DISPATCH_INT_FLOAT(detail::EvaluateNoQuantization, + input.tensor_element_type(), abs, input, output); } - return absl::OkStatus(); + return absl::FailedPreconditionError("Unsupported tensor type."); } } // namespace shlo_ref diff --git a/tensorflow/lite/experimental/shlo/ops/abs_test.cc b/tensorflow/lite/experimental/shlo/ops/abs_test.cc index 66972cabe9a6bf..0e3962825c56d6 100644 --- a/tensorflow/lite/experimental/shlo/ops/abs_test.cc +++ b/tensorflow/lite/experimental/shlo/ops/abs_test.cc @@ -14,12 +14,12 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/experimental/shlo/ops/abs.h" -#include -#include +#include #include #include #include "tensorflow/lite/experimental/shlo/ops/test_util.h" +#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h" #include "tensorflow/lite/experimental/shlo/quantize.h" #include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h" #include "tensorflow/lite/experimental/shlo/shape.h" @@ -30,6 +30,11 @@ using testing::ElementsAreArray; namespace shlo_ref { +template <> +struct ParamName { + static std::string Get() { return "Abs"; } +}; + namespace { constexpr struct AbsRef { @@ -39,12 +44,25 @@ constexpr struct AbsRef { } } abs_ref; +INSTANTIATE_TYPED_TEST_SUITE_P(Abs, UnaryElementwiseOpShapePropagationTest, + AbsOp, TestParamNames); + +INSTANTIATE_TYPED_TEST_SUITE_P( + Abs, UnaryElementwiseSameBaselineElementTypeConstraintTest, + UnaryElementwiseConstraint1Types, TestParamNames); + +using UnsupportedTypes = + WithOpTypes>; + +INSTANTIATE_TYPED_TEST_SUITE_P(Abs, UnaryElementwiseUnsupportedTypeTest, + UnsupportedTypes, TestParamNames); + template struct AbsTest : ::testing::Test {}; -TYPED_TEST_SUITE(AbsTest, NonQuantizedTestTypes, TestParamNames); +TYPED_TEST_SUITE(AbsTest, ArithmeticTestTypes, TestParamNames); -TYPED_TEST(AbsTest, NonQuantized) { +TYPED_TEST(AbsTest, ArithmeticTensorsWork) { using StorageT = typename TypeParam::StorageT; const Shape shape({2, 3, 4}); @@ -107,64 +125,5 @@ TYPED_TEST(QuantizedAbsTest, QuantizedPerTensor) { EXPECT_THAT(output_data, ElementsAreArray(expected_data)); } -TYPED_TEST(QuantizedAbsTest, QuantizedPerAxis) { - using StorageT = typename TypeParam::StorageT; - using ExpressedT = typename TypeParam::ExpressedT; - - const Shape shape({4, 3, 2}); - const int quantized_dimension = 2; - const size_t rank = shape.Rank(); - const Axis quantized_dimension_size = shape.Dim(quantized_dimension); - const size_t quantization_stride = [&] { - size_t res = 1; - for (int64_t i = rank - 1; i > quantized_dimension; --i) { - res *= shape.Dim(i); - } - return res; - }(); - Vector input_data = IotaBuffer(shape); - Vector output_data(shape.NumElements()); - Vector zero_points_data = RandomBuffer( - /*shape=*/Shape({shape.Dim(2)}), /*min=*/static_cast(-5), - /*max=*/static_cast(5)); - Vector scales_data = RandomBuffer( - /*shape=*/Shape({shape.Dim(2)}), /*min=*/static_cast(1), - /*max=*/static_cast(3)); - const QuantizedTensorElementType tensor_type = - QuantizedTensorElementType::PerAxis( - scales_data, zero_points_data, quantized_dimension); - Tensor input_tensor{ - .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type}, - .data = input_data.data()}; - Tensor output_tensor{ - .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type}, - .data = output_data.data()}; - - Vector expected_data(shape.NumElements()); - absl::c_transform( - input_data, expected_data.begin(), - [&, element_index = 0ull, quantization_index = 0ull](auto v) mutable { - const StorageT zero_point = zero_points_data[quantization_index]; - const ExpressedT scale = scales_data[quantization_index]; - - if (++element_index >= quantization_stride) { - element_index = 0; - if (++quantization_index >= quantized_dimension_size) { - quantization_index = 0; - } - } - const ExpressedT dequantized_input = Dequantize(v, zero_point, scale); - const ExpressedT dequantized_res = abs_ref(dequantized_input); - return Quantize( - dequantized_res, zero_point, 
ExpressedT(1) / scale); - }); - - auto op = Create(AbsOp::Attributes{}); - ASSERT_OK(Prepare(op, input_tensor, output_tensor)); - ASSERT_OK(Evaluate(op, input_tensor, output_tensor)); - EXPECT_THAT(output_data, ElementsAreArray(expected_data)); -} - } // namespace } // namespace shlo_ref diff --git a/tensorflow/lite/experimental/shlo/ops/cbrt.cc b/tensorflow/lite/experimental/shlo/ops/cbrt.cc index 2a526292829363..2e50c92c2e5998 100644 --- a/tensorflow/lite/experimental/shlo/ops/cbrt.cc +++ b/tensorflow/lite/experimental/shlo/ops/cbrt.cc @@ -19,7 +19,6 @@ limitations under the License. #include "absl/status/status.h" #include "tensorflow/lite/experimental/shlo/bf16.h" -#include "tensorflow/lite/experimental/shlo/data_type.h" #include "tensorflow/lite/experimental/shlo/dispatch.h" #include "tensorflow/lite/experimental/shlo/f16.h" #include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h" @@ -49,20 +48,10 @@ CbrtOp Create(CbrtOp::Attributes) { return {}; } absl::Status Prepare(CbrtOp& op, const Tensor& input, Tensor& output) { SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape())); - if (!input.IsQuantized() && IsInteger(input.StorageType())) { - return absl::FailedPreconditionError( - "stablehlo.cbrt does not support integer tensor types."); - } - if (input.IsPerAxisQuantized()) { - return absl::FailedPreconditionError( - "stablehlo.cbrt does not support per axis quantization."); - } - if (BaselineType(input.element_type()) != - BaselineType(output.element_type())) { - return absl::FailedPreconditionError( - "stablehlo.cbrt constraint (C1) is not satisfied (incompatible " - "baseline types)."); - } + SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes( + CheckCtx("cbrt"), input, IsFloatTensor, IsQuantizedPerTensorTensor)); + SHLO_REF_RETURN_ON_ERROR( + CheckSameBaselineType(CheckCtx("cbrt"), input, output)); return absl::OkStatus(); } @@ -73,11 +62,11 @@ absl::Status Evaluate(CbrtOp& op, const Tensor& input, Tensor& output) { input.quantized_tensor_element_type().StorageType(), input.quantized_tensor_element_type().ExpressedType(), cbrt, input, output) - } else { + } else if (IsFloatTensor(input)) { DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(), cbrt, input, output); } - return absl::OkStatus(); + return absl::FailedPreconditionError("Unsupported tensor type."); } }; // namespace shlo_ref diff --git a/tensorflow/lite/experimental/shlo/ops/cbrt_test.cc b/tensorflow/lite/experimental/shlo/ops/cbrt_test.cc index 1c8ae75845c0bb..687e3cb7debb15 100644 --- a/tensorflow/lite/experimental/shlo/ops/cbrt_test.cc +++ b/tensorflow/lite/experimental/shlo/ops/cbrt_test.cc @@ -16,26 +16,31 @@ limitations under the License. 
#include "tensorflow/lite/experimental/shlo/ops/cbrt.h" #include +#include #include #include -#include "absl/status/status.h" #include "tensorflow/lite/experimental/shlo/bf16.h" #include "tensorflow/lite/experimental/shlo/f16.h" #include "tensorflow/lite/experimental/shlo/ops/test_util.h" +#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h" #include "tensorflow/lite/experimental/shlo/quantize.h" #include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h" #include "tensorflow/lite/experimental/shlo/shape.h" #include "tensorflow/lite/experimental/shlo/status_matcher.h" #include "tensorflow/lite/experimental/shlo/tensor.h" -using shlo_ref::testing::StatusIs; using testing::ElementsAreArray; using testing::NanSensitiveFloatEq; using testing::Pointwise; namespace shlo_ref { +template <> +struct ParamName { + static std::string Get() { return "Cbrt"; } +}; + namespace { struct Cbrt { @@ -55,36 +60,25 @@ struct Cbrt { } } cbrt_ref; -template -struct NonQuantizedIntCbrtTest : ::testing::Test {}; +INSTANTIATE_TYPED_TEST_SUITE_P(Cbrt, UnaryElementwiseOpShapePropagationTest, + CbrtOp, TestParamNames); -TYPED_TEST_SUITE(NonQuantizedIntCbrtTest, NonQuantizedIntTestTypes, - TestParamNames); +INSTANTIATE_TYPED_TEST_SUITE_P( + Cbrt, UnaryElementwiseSameBaselineElementTypeConstraintTest, + UnaryElementwiseConstraint1Types, TestParamNames); -TYPED_TEST(NonQuantizedIntCbrtTest, IntTensorsRaiseAnError) { - using StorageT = typename TypeParam::StorageT; +using UnsupportedTypes = WithOpTypes< + CbrtOp, ConcatTypes>; - const Shape shape({2, 3, 4}); - Vector input_data = RandomBuffer(shape); - Vector output_data(shape.NumElements()); - - Tensor input_tensor{ - .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage}, - .data = nullptr}; - Tensor output_tensor = input_tensor; - - auto op = Create(CbrtOp::Attributes{}); - EXPECT_THAT(Prepare(op, input_tensor, output_tensor), - StatusIs(absl::StatusCode::kFailedPrecondition)); -} +INSTANTIATE_TYPED_TEST_SUITE_P(Cbrt, UnaryElementwiseUnsupportedTypeTest, + UnsupportedTypes, TestParamNames); template -struct NonQuantizedCbrtTest : ::testing::Test {}; +struct CbrtTest : ::testing::Test {}; -TYPED_TEST_SUITE(NonQuantizedCbrtTest, NonQuantizedFloatTestTypes, - TestParamNames); +TYPED_TEST_SUITE(CbrtTest, FloatTestTypes, TestParamNames); -TYPED_TEST(NonQuantizedCbrtTest, FloatTensorsWork) { +TYPED_TEST(CbrtTest, FloatTensorsWork) { using StorageT = typename TypeParam::StorageT; const Shape shape({2, 3, 4}); @@ -147,27 +141,5 @@ TYPED_TEST(QuantizedCbrtTest, PerTensorWorks) { EXPECT_THAT(output_data, ElementsAreArray(expected_data)); } -TYPED_TEST(QuantizedCbrtTest, PerAxisFails) { - using StorageT = typename TypeParam::StorageT; - using ExpressedT = typename TypeParam::ExpressedT; - - const Shape shape({4, 3, 2}); - const int quantized_dimension = 2; - Vector empty_scales; - Vector empty_zero_points; - const QuantizedTensorElementType tensor_type = - QuantizedTensorElementType::PerAxis( - empty_scales, empty_zero_points, quantized_dimension); - Tensor input_tensor{ - .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type}, - .data = nullptr}; - Tensor output_tensor = input_tensor; - - auto op = Create(CbrtOp::Attributes{}); - EXPECT_THAT(Prepare(op, input_tensor, output_tensor), - StatusIs(absl::StatusCode::kFailedPrecondition)); -} - } // namespace } // namespace shlo_ref diff --git a/tensorflow/lite/experimental/shlo/ops/ceil.cc b/tensorflow/lite/experimental/shlo/ops/ceil.cc index 
5a506dc12923af..a6b501131db5f9 100644
--- a/tensorflow/lite/experimental/shlo/ops/ceil.cc
+++ b/tensorflow/lite/experimental/shlo/ops/ceil.cc
@@ -49,20 +49,10 @@ CeilOp Create(CeilOp::Attributes) { return {}; }
 
 absl::Status Prepare(CeilOp& op, const Tensor& input, Tensor& output) {
   SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
-  if (!input.IsQuantized() && IsInteger(input.StorageType())) {
-    return absl::FailedPreconditionError(
-        "stablehlo.ceil does not support integer tensor types.");
-  }
-  if (input.IsPerAxisQuantized()) {
-    return absl::FailedPreconditionError(
-        "stablehlo.ceil does not support per axis quantization.");
-  }
-  if (BaselineType(input.element_type()) !=
-      BaselineType(output.element_type())) {
-    return absl::FailedPreconditionError(
-        "stablehlo.ceil constraint (C1) is not satisfied (incompatible "
-        "baseline types).");
-  }
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(
+      CheckCtx("ceil"), input, IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("ceil"), input, output));
   return absl::OkStatus();
 }
 
@@ -73,11 +63,11 @@ absl::Status Evaluate(CeilOp& op, const Tensor& input, Tensor& output) {
         input.quantized_tensor_element_type().StorageType(),
         input.quantized_tensor_element_type().ExpressedType(), ceil, input,
         output)
-  } else {
+  } else if (IsFloatTensor(input)) {
     DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
                    ceil, input, output);
   }
-  return absl::OkStatus();
+  return absl::FailedPreconditionError("Unsupported tensor type.");
 }
 
 };  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/ceil_test.cc b/tensorflow/lite/experimental/shlo/ops/ceil_test.cc
index 0875a02435e941..4059b19bcca63c 100644
--- a/tensorflow/lite/experimental/shlo/ops/ceil_test.cc
+++ b/tensorflow/lite/experimental/shlo/ops/ceil_test.cc
@@ -16,26 +16,31 @@ limitations under the License.
#include "tensorflow/lite/experimental/shlo/ops/ceil.h" #include +#include #include #include -#include "absl/status/status.h" #include "tensorflow/lite/experimental/shlo/bf16.h" #include "tensorflow/lite/experimental/shlo/f16.h" #include "tensorflow/lite/experimental/shlo/ops/test_util.h" +#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h" #include "tensorflow/lite/experimental/shlo/quantize.h" #include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h" #include "tensorflow/lite/experimental/shlo/shape.h" #include "tensorflow/lite/experimental/shlo/status_matcher.h" #include "tensorflow/lite/experimental/shlo/tensor.h" -using shlo_ref::testing::StatusIs; using testing::ElementsAreArray; using testing::NanSensitiveFloatEq; using testing::Pointwise; namespace shlo_ref { +template <> +struct ParamName { + static std::string Get() { return "Ceil"; } +}; + namespace { struct Ceil { @@ -55,36 +60,25 @@ struct Ceil { } } ceil_ref; -template -struct NonQuantizedIntCeilTest : ::testing::Test {}; +INSTANTIATE_TYPED_TEST_SUITE_P(Ceil, UnaryElementwiseOpShapePropagationTest, + CeilOp, TestParamNames); -TYPED_TEST_SUITE(NonQuantizedIntCeilTest, NonQuantizedIntTestTypes, - TestParamNames); +INSTANTIATE_TYPED_TEST_SUITE_P( + Ceil, UnaryElementwiseSameBaselineElementTypeConstraintTest, + UnaryElementwiseConstraint1Types, TestParamNames); -TYPED_TEST(NonQuantizedIntCeilTest, IntTensorsRaiseAnError) { - using StorageT = typename TypeParam::StorageT; +using UnsupportedTypes = WithOpTypes< + CeilOp, ConcatTypes>; - const Shape shape({2, 3, 4}); - Vector input_data = RandomBuffer(shape); - Vector output_data(shape.NumElements()); - - Tensor input_tensor{ - .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage}, - .data = nullptr}; - Tensor output_tensor = input_tensor; - - auto op = Create(CeilOp::Attributes{}); - EXPECT_THAT(Prepare(op, input_tensor, output_tensor), - StatusIs(absl::StatusCode::kFailedPrecondition)); -} +INSTANTIATE_TYPED_TEST_SUITE_P(Ceil, UnaryElementwiseUnsupportedTypeTest, + UnsupportedTypes, TestParamNames); template -struct NonQuantizedCeilTest : ::testing::Test {}; +struct CeilTest : ::testing::Test {}; -TYPED_TEST_SUITE(NonQuantizedCeilTest, NonQuantizedFloatTestTypes, - TestParamNames); +TYPED_TEST_SUITE(CeilTest, FloatTestTypes, TestParamNames); -TYPED_TEST(NonQuantizedCeilTest, FloatTensorsWork) { +TYPED_TEST(CeilTest, FloatTensorsWork) { using StorageT = typename TypeParam::StorageT; const Shape shape({2, 3, 4}); @@ -147,27 +141,5 @@ TYPED_TEST(QuantizedCeilTest, PerTensorWorks) { EXPECT_THAT(output_data, ElementsAreArray(expected_data)); } -TYPED_TEST(QuantizedCeilTest, PerAxisFails) { - using StorageT = typename TypeParam::StorageT; - using ExpressedT = typename TypeParam::ExpressedT; - - const Shape shape({4, 3, 2}); - const int quantized_dimension = 2; - Vector empty_scales; - Vector empty_zero_points; - const QuantizedTensorElementType tensor_type = - QuantizedTensorElementType::PerAxis( - empty_scales, empty_zero_points, quantized_dimension); - Tensor input_tensor{ - .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type}, - .data = nullptr}; - Tensor output_tensor = input_tensor; - - auto op = Create(CeilOp::Attributes{}); - EXPECT_THAT(Prepare(op, input_tensor, output_tensor), - StatusIs(absl::StatusCode::kFailedPrecondition)); -} - } // namespace } // namespace shlo_ref diff --git a/tensorflow/lite/experimental/shlo/ops/cosine.cc b/tensorflow/lite/experimental/shlo/ops/cosine.cc index 
e373708c15f369..8b757f9709ef18 100644
--- a/tensorflow/lite/experimental/shlo/ops/cosine.cc
+++ b/tensorflow/lite/experimental/shlo/ops/cosine.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include "absl/status/status.h"
 #include "tensorflow/lite/experimental/shlo/bf16.h"
-#include "tensorflow/lite/experimental/shlo/data_type.h"
 #include "tensorflow/lite/experimental/shlo/dispatch.h"
 #include "tensorflow/lite/experimental/shlo/f16.h"
 #include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
@@ -49,20 +48,10 @@ CosineOp Create(CosineOp::Attributes) { return {}; }
 
 absl::Status Prepare(CosineOp& op, const Tensor& input, Tensor& output) {
   SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
-  if (!input.IsQuantized() && IsInteger(input.StorageType())) {
-    return absl::FailedPreconditionError(
-        "stablehlo.cosine does not support integer tensor types.");
-  }
-  if (input.IsPerAxisQuantized()) {
-    return absl::FailedPreconditionError(
-        "stablehlo.cosine does not support per axis quantization.");
-  }
-  if (BaselineType(input.element_type()) !=
-      BaselineType(output.element_type())) {
-    return absl::FailedPreconditionError(
-        "stablehlo.cosine constraint (C1) is not satisfied (incompatible "
-        "baseline types).");
-  }
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(
+      CheckCtx("cosine"), input, IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("cosine"), input, output));
   return absl::OkStatus();
 }
 
@@ -73,11 +62,11 @@ absl::Status Evaluate(CosineOp& op, const Tensor& input, Tensor& output) {
         input.quantized_tensor_element_type().StorageType(),
         input.quantized_tensor_element_type().ExpressedType(), cosine, input,
         output)
-  } else {
+  } else if (IsFloatTensor(input)) {
     DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
                    cosine, input, output);
   }
-  return absl::OkStatus();
+  return absl::FailedPreconditionError("Unsupported tensor type.");
 }
 
 };  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/cosine_test.cc b/tensorflow/lite/experimental/shlo/ops/cosine_test.cc
index 7eb8901cbe2aff..41fce8a264dd57 100644
--- a/tensorflow/lite/experimental/shlo/ops/cosine_test.cc
+++ b/tensorflow/lite/experimental/shlo/ops/cosine_test.cc
@@ -16,26 +16,31 @@ limitations under the License.
#include "tensorflow/lite/experimental/shlo/ops/cosine.h" #include +#include #include #include -#include "absl/status/status.h" #include "tensorflow/lite/experimental/shlo/bf16.h" #include "tensorflow/lite/experimental/shlo/f16.h" #include "tensorflow/lite/experimental/shlo/ops/test_util.h" +#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h" #include "tensorflow/lite/experimental/shlo/quantize.h" #include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h" #include "tensorflow/lite/experimental/shlo/shape.h" #include "tensorflow/lite/experimental/shlo/status_matcher.h" #include "tensorflow/lite/experimental/shlo/tensor.h" -using shlo_ref::testing::StatusIs; using testing::ElementsAreArray; using testing::NanSensitiveFloatEq; using testing::Pointwise; namespace shlo_ref { +template <> +struct ParamName { + static std::string Get() { return "Cosine"; } +}; + namespace { struct Cosine { @@ -55,36 +60,26 @@ struct Cosine { } } cosine_ref; -template -struct NonQuantizedIntCosineTest : ::testing::Test {}; +INSTANTIATE_TYPED_TEST_SUITE_P(Cosine, UnaryElementwiseOpShapePropagationTest, + CosineOp, TestParamNames); -TYPED_TEST_SUITE(NonQuantizedIntCosineTest, NonQuantizedIntTestTypes, - TestParamNames); +INSTANTIATE_TYPED_TEST_SUITE_P( + Cosine, UnaryElementwiseSameBaselineElementTypeConstraintTest, + UnaryElementwiseConstraint1Types, TestParamNames); -TYPED_TEST(NonQuantizedIntCosineTest, IntTensorsRaiseAnError) { - using StorageT = typename TypeParam::StorageT; +using UnsupportedTypes = + WithOpTypes>; - const Shape shape({2, 3, 4}); - Vector input_data = RandomBuffer(shape); - Vector output_data(shape.NumElements()); - - Tensor input_tensor{ - .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage}, - .data = nullptr}; - Tensor output_tensor = input_tensor; - - auto op = Create(CosineOp::Attributes{}); - EXPECT_THAT(Prepare(op, input_tensor, output_tensor), - StatusIs(absl::StatusCode::kFailedPrecondition)); -} +INSTANTIATE_TYPED_TEST_SUITE_P(Cosine, UnaryElementwiseUnsupportedTypeTest, + UnsupportedTypes, TestParamNames); template -struct NonQuantizedCosineTest : ::testing::Test {}; +struct CosineTest : ::testing::Test {}; -TYPED_TEST_SUITE(NonQuantizedCosineTest, NonQuantizedFloatTestTypes, - TestParamNames); +TYPED_TEST_SUITE(CosineTest, FloatTestTypes, TestParamNames); -TYPED_TEST(NonQuantizedCosineTest, FloatTensorsWork) { +TYPED_TEST(CosineTest, FloatTensorsWork) { using StorageT = typename TypeParam::StorageT; const Shape shape({2, 3, 4}); @@ -147,27 +142,5 @@ TYPED_TEST(QuantizedCosineTest, PerTensorWorks) { EXPECT_THAT(output_data, ElementsAreArray(expected_data)); } -TYPED_TEST(QuantizedCosineTest, PerAxisFails) { - using StorageT = typename TypeParam::StorageT; - using ExpressedT = typename TypeParam::ExpressedT; - - const Shape shape({4, 3, 2}); - const int quantized_dimension = 2; - Vector empty_scales; - Vector empty_zero_points; - const QuantizedTensorElementType tensor_type = - QuantizedTensorElementType::PerAxis( - empty_scales, empty_zero_points, quantized_dimension); - Tensor input_tensor{ - .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type}, - .data = nullptr}; - Tensor output_tensor = input_tensor; - - auto op = Create(CosineOp::Attributes{}); - EXPECT_THAT(Prepare(op, input_tensor, output_tensor), - StatusIs(absl::StatusCode::kFailedPrecondition)); -} - } // namespace } // namespace shlo_ref diff --git a/tensorflow/lite/experimental/shlo/ops/test_util.h 
b/tensorflow/lite/experimental/shlo/ops/test_util.h index 9b64d10c8455f0..9eaab155e5c20f 100644 --- a/tensorflow/lite/experimental/shlo/ops/test_util.h +++ b/tensorflow/lite/experimental/shlo/ops/test_util.h @@ -18,16 +18,21 @@ limitations under the License. #include #include +#include #include #include #include "absl/algorithm/container.h" #include "absl/container/inlined_vector.h" #include "tensorflow/lite/experimental/shlo/data_type.h" +#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h" #include "tensorflow/lite/experimental/shlo/shape.h" +#include "tensorflow/lite/experimental/shlo/tensor.h" namespace shlo_ref { +// We use a vector class that is different from std::vector to have a consistent +// API when dealing with bool tensors. template using Vector = absl::InlinedVector; @@ -91,6 +96,19 @@ struct TestParam { using ExpressedT = StorageType; }; +// Typed test parameter tag to ask for a per-tensor quantized tensor. +template +struct PerTensor { + using Param = TestParamT; +}; + +// Typed test parameter tag to ask for a per-channel quantized tensor. +template +struct PerAxis { + using Param = TestParamT; + static constexpr Axis axis = kAxis; +}; + constexpr const char* ToString(DataType t) { switch (t) { case DataType::kI1: @@ -133,6 +151,33 @@ struct ParamName> { } }; +template +struct ParamName>> { + static std::string Get() { + std::string name = std::string("PerTensor[") + ToString(T); + ((name += std::string("_") + ToString(Ts)), ...); + return name + "]"; + } +}; + +template +struct ParamName, axis>> { + static std::string Get() { + std::string name = std::string("PerAxis[") + ToString(T); + ((name += std::string("_") + ToString(Ts)), ...); + return name + ":" + std::to_string(axis) + "]"; + } +}; + +template +struct ParamName> { + static std::string Get() { + std::string name = ParamName::Get(); + ((name += std::string(":") + ParamName::Get()), ...); + return name; + } +}; + class TestParamNames { public: template @@ -141,32 +186,123 @@ class TestParamNames { } }; +template